[X86] Reuse EFLAGS and form LOCKed ops when only user is SETCC.

We only generate LOCKed versions of add/sub when the result is unused. It often happens that the result is used, but only by a comparison. We can optimize those out by reusing EFLAGS, which lets us use the proper instructions, instead of having to fallback to LXADD. Instead of doing this as an MI peephole (as we do for the other non-LOCKed (really, non-MR) forms), do it in ISel. It becomes quite tricky later. This also makes it eventually possible to stop expanding and/or/xor if the only user is an icmp (also see D18141). This uses the LOCK ISD opcodes added by r262244. Differential Revision: http://reviews.llvm.org/D17633 llvm-svn: 265450
2024-11-24 03:33:20 +01:00 · 2016-04-05 20:02:57 +00:00 · 2016-04-05 20:02:57 +00:00 · 6921e4df62
commit 6921e4df62
parent 1d62b8004b
2 changed files with 81 additions and 30 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -26134,6 +26134,56 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
  return SDValue();
 }

+/// Combine:
+///   (brcond/cmov/setcc .., (cmp (atomic_load_op ..), 0), cc)
+/// to:
+///   (brcond/cmov/setcc .., (LOCKed op ..), cc)
+/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
+/// Note that this is only legal for some op/cc combinations.
+static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode CC,
+                                       SelectionDAG &DAG) {
+  // This combine only operates on CMP-like nodes.
+  if (!(Cmp.getOpcode() == X86ISD::CMP ||
+        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
+    return SDValue();
+
+  SDValue LHS = Cmp.getOperand(0);
+  SDValue RHS = Cmp.getOperand(1);
+
+  if (!LHS.hasOneUse())
+    return SDValue();
+
+  // FIXME: We can do this for XOR/OR/AND as well, but only if they survive
+  // AtomicExpand. Currently, we choose to expand them to cmpxchg if they
+  // have any users. Could we relax that to ignore (icmp x,0) users?
+  switch (LHS->getOpcode()) {
+  case ISD::ATOMIC_LOAD_ADD:
+  case ISD::ATOMIC_LOAD_SUB:
+    break;
+  default:
+    return SDValue();
+  }
+
+  auto *C = dyn_cast<ConstantSDNode>(RHS);
+  if (!C || C->getZExtValue() != 0)
+    return SDValue();
+
+  // Don't do this for all condition codes, as OF/CF are cleared by (CMP x,0)
+  // but might be set by arithmetic. Furthermore, we might later select INC/DEC,
+  // which don't modify CF (though CCs using CF should have been optimized out).
+  // SF/ZF are safe as they are set the same way.
+  // Note that in theory, the transformation is also valid for P/NP.
+  if (CC != X86::COND_E && CC != X86::COND_NE && CC != X86::COND_S &&
+      CC != X86::COND_NS)
+    return SDValue();
+
+  SDValue LockOp = lowerAtomicArithWithLOCK(LHS, DAG);
+  DAG.ReplaceAllUsesOfValueWith(LHS.getValue(0),
+                                DAG.getUNDEF(LHS.getValueType()));
+  DAG.ReplaceAllUsesOfValueWith(LHS.getValue(1), LockOp.getValue(1));
+  return LockOp;
+}
+
 // Check whether a boolean test is testing a boolean value generated by
 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
 // code.
@ -26305,6 +26355,16 @@ static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
  return true;
 }

+/// Optimize an EFLAGS definition used according to the condition code \p CC
+/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
+/// uses of chain values.
+static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
+                                  SelectionDAG &DAG) {
+  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
+    return R;
+  return combineSetCCAtomicArith(EFLAGS, CC, DAG);
+}
+
 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
                           TargetLowering::DAGCombinerInfo &DCI,
@ -26331,15 +26391,14 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
    }
  }

-  SDValue Flags;
-
-  Flags = checkBoolTestSetCCCombine(Cond, CC);
-  if (Flags.getNode() &&
-      // Extra check as FCMOV only supports a subset of X86 cond.
-      (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
-    SDValue Ops[] = { FalseOp, TrueOp,
-                      DAG.getConstant(CC, DL, MVT::i8), Flags };
-    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+  // Try to simplify the EFLAGS and condition code operands.
+  // We can't always do this as FCMOV only supports a subset of X86 cond.
+  if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
+    if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
+      SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
+        Flags};
+      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+    }
  }

  // If this is a select between two integer constants, try to do some
@ -29265,7 +29324,8 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
  if (CC == X86::COND_B)
    return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));

-  if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+  // Try to simplify the EFLAGS and condition code operands.
+  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
    return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
  }
@ -29278,15 +29338,16 @@ static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  SDLoc DL(N);
-  SDValue Chain = N->getOperand(0);
-  SDValue Dest = N->getOperand(1);
  SDValue EFLAGS = N->getOperand(3);
  X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));

-  if (SDValue Flags = checkBoolTestSetCCCombine(EFLAGS, CC)) {
+  // Try to simplify the EFLAGS and condition code operands.
+  // Make sure to not keep references to operands, as combineSetCCEFLAGS can
+  // RAUW them under us.
+  if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
    SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
-    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
-                       Flags);
+    return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
+                       N->getOperand(1), Cond, Flags);
  }

  return SDValue();
--- a/test/CodeGen/X86/atomic-eflags-reuse.ll
+++ b/test/CodeGen/X86/atomic-eflags-reuse.ll
@ -4,9 +4,7 @@
 define i8 @test_add_1_setcc_ne(i64* %p) #0 {
 ; CHECK-LABEL: test_add_1_setcc_ne:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock incq (%rdi)
 ; CHECK-NEXT:    setne %al
 ; CHECK-NEXT:    retq
 entry:
@ -19,9 +17,7 @@ entry:
 define i8 @test_sub_1_setcc_eq(i64* %p) #0 {
 ; CHECK-LABEL: test_sub_1_setcc_eq:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movq $-1, %rax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock decq (%rdi)
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
 entry:
@ -49,9 +45,7 @@ entry:
 define i8 @test_sub_10_setcc_sge(i64* %p) #0 {
 ; CHECK-LABEL: test_sub_10_setcc_sge:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movq $-10, %rax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock addq $-10, (%rdi)
 ; CHECK-NEXT:    setns %al
 ; CHECK-NEXT:    retq
 entry:
@ -66,9 +60,7 @@ entry:
 define i32 @test_add_10_brcond_sge(i64* %p, i32 %a0, i32 %a1) #0 {
 ; CHECK-LABEL: test_add_10_brcond_sge:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movl $10, %eax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock addq $10, (%rdi)
 ; CHECK-NEXT:    js .LBB4_2
 ; CHECK-NEXT:  # BB#1: # %t
 ; CHECK-NEXT:    movl %esi, %eax
@ -89,9 +81,7 @@ f:
 define i32 @test_sub_1_cmov_slt(i64* %p, i32 %a0, i32 %a1) #0 {
 ; CHECK-LABEL: test_sub_1_cmov_slt:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    movq $-1, %rax
-; CHECK-NEXT:    lock xaddq %rax, (%rdi)
-; CHECK-NEXT:    testq %rax, %rax
+; CHECK-NEXT:    lock decq (%rdi)
 ; CHECK-NEXT:    cmovnsl %edx, %esi
 ; CHECK-NEXT:    movl %esi, %eax
 ; CHECK-NEXT:    retq