mirror of https://github.com/RPCS3/llvm-mirror.git
[GlobalISel] Rewrite the elide-br-by-swapping-icmp-ops combine to do less.
This combine previously tried to take sequences like:

  %cond = G_ICMP pred, a, b
  G_BRCOND %cond, %truebb
  G_BR %falsebb
%truebb:
  ...
%falsebb:
  ...

and, by inverting the compare predicate and swapping branch targets, delete the G_BR and instead have a single conditional branch to the falsebb. Since an earlier patch added a combine to fold not(icmp) into just an inverted icmp, we don't need this combine to do as much. This patch instead generalizes the combine by just looking for:

  G_BRCOND %cond, %truebb
  G_BR %falsebb
%truebb:
  ...
%falsebb:
  ...

and then inverting the condition using a not (xor). The xor can be folded away in a separate combine. This change also lets us avoid some optimization code in the IRTranslator.

I also think that deleting G_BRs in the combiner is unnecessary: that's something that targets can decide to do at selection time, and leaving it to them could simplify generic code in future.

Differential Revision: https://reviews.llvm.org/D86664
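As an illustration (not part of the commit message; virtual register and block numbers are hypothetical, chosen to mirror the updated MIR test below), the new combine turns:

  bb.0:
    %c:_(s1) = G_ICMP intpred(sgt), %x, %y
    G_BRCOND %c(s1), %bb.1        ; %bb.1 is the layout successor
    G_BR %bb.2

into:

  bb.0:
    %c:_(s1) = G_ICMP intpred(sgt), %x, %y
    %true:_(s1) = G_CONSTANT i1 true
    %inv:_(s1) = G_XOR %c, %true
    G_BRCOND %inv(s1), %bb.2
    G_BR %bb.1                    ; now just branches to the fallthrough block

so the conditional branch goes to the far block and the G_BR to the fallthrough, without the combine having to understand the compare at all.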
parent dc79f6327a
commit a7636dc8f8
@@ -147,9 +147,10 @@ public:
   bool matchSextInRegOfLoad(MachineInstr &MI, std::tuple<Register, unsigned> &MatchInfo);
   bool applySextInRegOfLoad(MachineInstr &MI, std::tuple<Register, unsigned> &MatchInfo);
 
-  bool matchElideBrByInvertingCond(MachineInstr &MI);
-  void applyElideBrByInvertingCond(MachineInstr &MI);
-  bool tryElideBrByInvertingCond(MachineInstr &MI);
+  /// If a brcond's true block is not the fallthrough, make it so by inverting
+  /// the condition and swapping operands.
+  bool matchOptBrCondByInvertingCond(MachineInstr &MI);
+  void applyOptBrCondByInvertingCond(MachineInstr &MI);
 
   /// If \p MI is G_CONCAT_VECTORS, try to combine it.
   /// Returns true if MI changed.
@@ -245,5 +245,9 @@ bool isBuildVectorAllOnes(const MachineInstr &MI,
 /// the value \p Val contains a true value.
 bool isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector,
                     bool IsFP);
+
+/// Returns an integer representing true, as defined by the
+/// TargetBooleanContents.
+int64_t getICmpTrueVal(const TargetLowering &TLI, bool IsVector, bool IsFP);
 } // End namespace llvm.
 #endif
@@ -145,13 +145,11 @@ def combine_indexed_load_store : GICombineRule<
          [{ return Helper.matchCombineIndexedLoadStore(*${root}, ${matchinfo}); }]),
   (apply [{ Helper.applyCombineIndexedLoadStore(*${root}, ${matchinfo}); }])>;
 
-// FIXME: Is there a reason this wasn't in tryCombine? I've left it out of
-// all_combines because it wasn't there.
-def elide_br_by_inverting_cond : GICombineRule<
+def opt_brcond_by_inverting_cond : GICombineRule<
   (defs root:$root),
   (match (wip_match_opcode G_BR):$root,
-         [{ return Helper.matchElideBrByInvertingCond(*${root}); }]),
-  (apply [{ Helper.applyElideBrByInvertingCond(*${root}); }])>;
+         [{ return Helper.matchOptBrCondByInvertingCond(*${root}); }]),
+  (apply [{ Helper.applyOptBrCondByInvertingCond(*${root}); }])>;
 
 def ptr_add_immed_matchdata : GIDefMatchData<"PtrAddChain">;
 def ptr_add_immed_chain : GICombineRule<
@@ -416,4 +414,4 @@ def all_combines : GICombineGroup<[trivial_combines, ptr_add_immed_chain,
     shl_ashr_to_sext_inreg, sext_inreg_of_load,
     width_reduction_combines, select_combines,
     known_bits_simplifications, ext_ext_fold,
-    not_cmp_fold]>;
+    not_cmp_fold, opt_brcond_by_inverting_cond]>;
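Because opt_brcond_by_inverting_cond now sits in all_combines alongside not_cmp_fold, the G_XOR it introduces is normally cleaned up in the same combiner run. A sketch of that interaction (illustrative register names, assuming the xor'd condition has a single use):

  ; after opt_brcond_by_inverting_cond:
  %c:_(s1) = G_ICMP intpred(sgt), %x, %y
  %true:_(s1) = G_CONSTANT i1 true
  %inv:_(s1) = G_XOR %c, %true
  G_BRCOND %inv(s1), %bb.2

  ; after not_cmp_fold:
  %inv:_(s1) = G_ICMP intpred(sle), %x, %y
  G_BRCOND %inv(s1), %bb.2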
@@ -881,14 +881,12 @@ void CombinerHelper::applyCombineIndexedLoadStore(
   LLVM_DEBUG(dbgs() << "  Combinined to indexed operation");
 }
 
-bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) {
+bool CombinerHelper::matchOptBrCondByInvertingCond(MachineInstr &MI) {
   if (MI.getOpcode() != TargetOpcode::G_BR)
     return false;
 
   // Try to match the following:
   // bb1:
-  //   %c(s32) = G_ICMP pred, %a, %b
-  //   %c1(s1) = G_TRUNC %c(s32)
   //   G_BRCOND %c1, %bb2
   //   G_BR %bb3
   // bb2:
@@ -898,7 +896,7 @@ bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) {
   // The above pattern does not have a fall through to the successor bb2, always
   // resulting in a branch no matter which path is taken. Here we try to find
   // and replace that pattern with conditional branch to bb3 and otherwise
-  // fallthrough to bb2.
+  // fallthrough to bb2. This is generally better for branch predictors.
 
   MachineBasicBlock *MBB = MI.getParent();
   MachineBasicBlock::iterator BrIt(MI);
@@ -913,40 +911,34 @@ bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) {
   // Check that the next block is the conditional branch target.
   if (!MBB->isLayoutSuccessor(BrCond->getOperand(1).getMBB()))
     return false;
-
-  MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
-  if (!CmpMI || CmpMI->getOpcode() != TargetOpcode::G_ICMP ||
-      !MRI.hasOneNonDBGUse(CmpMI->getOperand(0).getReg()))
-    return false;
   return true;
 }
 
-bool CombinerHelper::tryElideBrByInvertingCond(MachineInstr &MI) {
-  if (!matchElideBrByInvertingCond(MI))
-    return false;
-  applyElideBrByInvertingCond(MI);
-  return true;
-}
-
-void CombinerHelper::applyElideBrByInvertingCond(MachineInstr &MI) {
+void CombinerHelper::applyOptBrCondByInvertingCond(MachineInstr &MI) {
   MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB();
   MachineBasicBlock::iterator BrIt(MI);
   MachineInstr *BrCond = &*std::prev(BrIt);
-  MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
 
-  CmpInst::Predicate InversePred = CmpInst::getInversePredicate(
-      (CmpInst::Predicate)CmpMI->getOperand(1).getPredicate());
+  Builder.setInstrAndDebugLoc(*BrCond);
+  LLT Ty = MRI.getType(BrCond->getOperand(0).getReg());
+  // FIXME: Does int/fp matter for this? If so, we might need to restrict
+  // this to i1 only since we might not know for sure what kind of
+  // compare generated the condition value.
+  auto True = Builder.buildConstant(
+      Ty, getICmpTrueVal(getTargetLowering(), false, false));
+  auto Xor = Builder.buildXor(Ty, BrCond->getOperand(0), True);
 
-  // Invert the G_ICMP condition.
-  Observer.changingInstr(*CmpMI);
-  CmpMI->getOperand(1).setPredicate(InversePred);
-  Observer.changedInstr(*CmpMI);
+  auto *FallthroughBB = BrCond->getOperand(1).getMBB();
+  Observer.changingInstr(MI);
+  MI.getOperand(0).setMBB(FallthroughBB);
+  Observer.changedInstr(MI);
 
-  // Change the conditional branch target.
+  // Change the conditional branch to use the inverted condition and
+  // new target block.
   Observer.changingInstr(*BrCond);
+  BrCond->getOperand(0).setReg(Xor.getReg(0));
   BrCond->getOperand(1).setMBB(BrTarget);
   Observer.changedInstr(*BrCond);
-  MI.eraseFromParent();
 }
 
 static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
@@ -740,3 +740,15 @@ bool llvm::isConstTrueVal(const TargetLowering &TLI, int64_t Val, bool IsVector,
   }
   llvm_unreachable("Invalid boolean contents");
 }
+
+int64_t llvm::getICmpTrueVal(const TargetLowering &TLI, bool IsVector,
+                             bool IsFP) {
+  switch (TLI.getBooleanContents(IsVector, IsFP)) {
+  case TargetLowering::UndefinedBooleanContent:
+  case TargetLowering::ZeroOrOneBooleanContent:
+    return 1;
+  case TargetLowering::ZeroOrNegativeOneBooleanContent:
+    return -1;
+  }
+  llvm_unreachable("Invalid boolean contents");
+}
@@ -19,7 +19,6 @@ def fconstant_to_constant : GICombineRule<
 
 def AArch64PreLegalizerCombinerHelper: GICombinerHelper<
   "AArch64GenPreLegalizerCombinerHelper", [all_combines,
-                                           elide_br_by_inverting_cond,
                                            fconstant_to_constant]> {
   let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule";
   let StateClass = "AArch64PreLegalizerCombinerHelperState";
@@ -42,8 +42,7 @@ def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
 
 
 def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
-  "AMDGPUGenPreLegalizerCombinerHelper", [all_combines,
-                                          elide_br_by_inverting_cond]> {
+  "AMDGPUGenPreLegalizerCombinerHelper", [all_combines]> {
   let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
 }
 
@@ -1,25 +0,0 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -global-isel -O0 -o - %s | FileCheck %s
-
-%struct.comp = type { i8*, i32, i8*, [3 x i8], i32 }
-
-define void @regbranch() {
-; CHECK-LABEL: regbranch:
-; CHECK: mov {{w[0-9]+}}, #0
-cond_next240.i:
-  br i1 false, label %cond_true251.i, label %cond_next272.i
-
-cond_true251.i:
-  switch i8 0, label %cond_next272.i [
-    i8 42, label %bb268.i
-    i8 43, label %bb268.i
-    i8 63, label %bb268.i
-  ]
-
-bb268.i:
-  br label %cond_next272.i
-
-cond_next272.i:
-  %len.2.i = phi i32 [ 0, %bb268.i ], [ 0, %cond_next240.i ], [ 0, %cond_true251.i ]
-  %tmp278.i = icmp eq i32 %len.2.i, 1
-  ret void
-}
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -debugify-and-strip-all-safe -O0 -run-pass=aarch64-prelegalizer-combiner -global-isel -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -debugify-and-strip-all-safe -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="opt_brcond_by_inverting_cond" -global-isel -verify-machineinstrs %s -o - | FileCheck %s
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
   target triple = "arm64-apple-ios5.0.0"
@@ -38,8 +38,11 @@ body: |
 ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1
 ; CHECK: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; CHECK: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[COPY]](s32), [[C]]
-; CHECK: G_BRCOND [[ICMP]](s1), %bb.2
+; CHECK: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY]](s32), [[C]]
+; CHECK: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+; CHECK: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C2]]
+; CHECK: G_BRCOND [[XOR]](s1), %bb.2
+; CHECK: G_BR %bb.1
 ; CHECK: bb.1.if.then:
 ; CHECK: successors: %bb.3(0x80000000)
 ; CHECK: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[COPY1]], [[COPY]]
@@ -8,6 +8,8 @@
   define i16 @const_s16() { ret i16 42 }
   define i32 @const_s32() { ret i32 42 }
   define i64 @const_s64() { ret i64 1234567890123 }
+  define i32 @const_s32_zero() { ret i32 0 }
+  define i64 @const_s64_zero() { ret i64 0 }
   define i8* @const_p0_0() { ret i8* null }
 
   define i32 @fconst_s32() { ret i32 42 }
@@ -81,6 +83,38 @@ body: |
     $x0 = COPY %0(s64)
 ...
 
+---
+name: const_s32_zero
+legalized: true
+regBankSelected: true
+registers:
+  - { id: 0, class: gpr }
+
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: const_s32_zero
+    ; CHECK: [[COPY:%[0-9]+]]:gpr32 = COPY $wzr
+    ; CHECK: $w0 = COPY [[COPY]]
+    %0(s32) = G_CONSTANT i32 0
+    $w0 = COPY %0(s32)
+...
+
+---
+name: const_s64_zero
+legalized: true
+regBankSelected: true
+registers:
+  - { id: 0, class: gpr }
+
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: const_s64_zero
+    ; CHECK: [[COPY:%[0-9]+]]:gpr64 = COPY $xzr
+    ; CHECK: $x0 = COPY [[COPY]]
+    %0(s64) = G_CONSTANT i64 0
+    $x0 = COPY %0(s64)
+...
+
 ---
 name: const_p0_0
 legalized: true
@@ -52,9 +52,10 @@ define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) {
 ; GCN: ; %bb.0: ; %entry
 ; GCN-NEXT: s_load_dword s0, s[0:1], 0x9
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_xor_b32 s0, s0, -1
 ; GCN-NEXT: s_and_b32 s0, s0, 1
 ; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cbranch_scc0 BB3_2
+; GCN-NEXT: s_cbranch_scc1 BB3_2
 ; GCN-NEXT: ; %bb.1: ; %bb0
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
 ; GCN-NEXT: flat_store_dword v[0:1], v0
@@ -80,9 +81,10 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
 ; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_and_b32 s0, s0, s1
+; GCN-NEXT: s_xor_b32 s0, s0, -1
 ; GCN-NEXT: s_and_b32 s0, s0, 1
 ; GCN-NEXT: s_cmp_lg_u32 s0, 0
-; GCN-NEXT: s_cbranch_scc0 BB4_2
+; GCN-NEXT: s_cbranch_scc1 BB4_2
 ; GCN-NEXT: ; %bb.1: ; %bb0
 ; GCN-NEXT: v_mov_b32_e32 v0, 0
 ; GCN-NEXT: flat_store_dword v[0:1], v0
@@ -51,11 +51,11 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
 ; CI-NEXT: s_load_dword s0, s[4:5], 0x11
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_cmp_eq_u32 s1, s0
+; CI-NEXT: s_cmp_lg_u32 s1, s0
 ; CI-NEXT: s_cselect_b32 s0, 1, 0
 ; CI-NEXT: s_and_b32 s0, s0, 1
 ; CI-NEXT: s_cmp_lg_u32 s0, 0
-; CI-NEXT: s_cbranch_scc0 BB1_2
+; CI-NEXT: s_cbranch_scc1 BB1_2
 ; CI-NEXT: ; %bb.1: ; %bb0
 ; CI-NEXT: v_mov_b32_e32 v0, 0
 ; CI-NEXT: flat_store_dword v[0:1], v0
@@ -68,11 +68,11 @@ define amdgpu_kernel void @is_private_sgpr(i8* %ptr) {
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 0, 16)
 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16
-; GFX9-NEXT: s_cmp_eq_u32 s1, s0
+; GFX9-NEXT: s_cmp_lg_u32 s1, s0
 ; GFX9-NEXT: s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT: s_and_b32 s0, s0, 1
 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_cbranch_scc0 BB1_2
+; GFX9-NEXT: s_cbranch_scc1 BB1_2
 ; GFX9-NEXT: ; %bb.1: ; %bb0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: global_store_dword v[0:1], v0, off
@@ -51,11 +51,11 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
 ; CI-NEXT: s_load_dword s0, s[4:5], 0x10
 ; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_cmp_eq_u32 s1, s0
+; CI-NEXT: s_cmp_lg_u32 s1, s0
 ; CI-NEXT: s_cselect_b32 s0, 1, 0
 ; CI-NEXT: s_and_b32 s0, s0, 1
 ; CI-NEXT: s_cmp_lg_u32 s0, 0
-; CI-NEXT: s_cbranch_scc0 BB1_2
+; CI-NEXT: s_cbranch_scc1 BB1_2
 ; CI-NEXT: ; %bb.1: ; %bb0
 ; CI-NEXT: v_mov_b32_e32 v0, 0
 ; CI-NEXT: flat_store_dword v[0:1], v0
@@ -68,11 +68,11 @@ define amdgpu_kernel void @is_local_sgpr(i8* %ptr) {
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT: s_getreg_b32 s0, hwreg(HW_REG_SH_MEM_BASES, 16, 16)
 ; GFX9-NEXT: s_lshl_b32 s0, s0, 16
-; GFX9-NEXT: s_cmp_eq_u32 s1, s0
+; GFX9-NEXT: s_cmp_lg_u32 s1, s0
 ; GFX9-NEXT: s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT: s_and_b32 s0, s0, 1
 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_cbranch_scc0 BB1_2
+; GFX9-NEXT: s_cbranch_scc1 BB1_2
 ; GFX9-NEXT: ; %bb.1: ; %bb0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0
 ; GFX9-NEXT: global_store_dword v[0:1], v0, off
@@ -29,9 +29,10 @@ define amdgpu_kernel void @localize_constants(i1 %cond) {
 ; GFX9-NEXT: s_mov_b32 s0, 0
 ; GFX9-NEXT: global_store_dword v[0:1], v0, off
 ; GFX9-NEXT: BB0_2: ; %Flow
+; GFX9-NEXT: s_xor_b32 s0, s0, -1
 ; GFX9-NEXT: s_and_b32 s0, s0, 1
 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_cbranch_scc0 BB0_4
+; GFX9-NEXT: s_cbranch_scc1 BB0_4
 ; GFX9-NEXT: ; %bb.3: ; %bb0
 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
 ; GFX9-NEXT: global_store_dword v[0:1], v0, off
@@ -109,9 +110,10 @@ define amdgpu_kernel void @localize_globals(i1 %cond) {
 ; GFX9-NEXT: v_mov_b32_e32 v1, s5
 ; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: BB1_2: ; %Flow
+; GFX9-NEXT: s_xor_b32 s0, s0, -1
 ; GFX9-NEXT: s_and_b32 s0, s0, 1
 ; GFX9-NEXT: s_cmp_lg_u32 s0, 0
-; GFX9-NEXT: s_cbranch_scc0 BB1_4
+; GFX9-NEXT: s_cbranch_scc1 BB1_4
 ; GFX9-NEXT: ; %bb.3: ; %bb0
 ; GFX9-NEXT: s_getpc_b64 s[0:1]
 ; GFX9-NEXT: s_add_u32 s0, s0, gv0@gotpcrel32@lo+4
@@ -357,9 +357,10 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT: BB1_2:
 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT: BB1_3: ; %Flow
-; CHECK-NEXT: s_and_b32 s0, s1, 1
+; CHECK-NEXT: s_xor_b32 s0, s1, -1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 BB1_5
+; CHECK-NEXT: s_cbranch_scc1 BB1_5
 ; CHECK-NEXT: ; %bb.4:
 ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4
 ; CHECK-NEXT: s_sub_i32 s0, 0, s4
@@ -351,9 +351,10 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT: BB1_2:
 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT: BB1_3: ; %Flow
-; CHECK-NEXT: s_and_b32 s0, s1, 1
+; CHECK-NEXT: s_xor_b32 s0, s1, -1
+; CHECK-NEXT: s_and_b32 s0, s0, 1
 ; CHECK-NEXT: s_cmp_lg_u32 s0, 0
-; CHECK-NEXT: s_cbranch_scc0 BB1_5
+; CHECK-NEXT: s_cbranch_scc1 BB1_5
 ; CHECK-NEXT: ; %bb.4:
 ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s4
 ; CHECK-NEXT: s_sub_i32 s0, 0, s4
@@ -323,9 +323,10 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT: BB1_2:
 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT: BB1_3: ; %Flow
-; CHECK-NEXT: s_and_b32 s1, s5, 1
+; CHECK-NEXT: s_xor_b32 s1, s5, -1
+; CHECK-NEXT: s_and_b32 s1, s1, 1
 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0
-; CHECK-NEXT: s_cbranch_scc0 BB1_5
+; CHECK-NEXT: s_cbranch_scc1 BB1_5
 ; CHECK-NEXT: ; %bb.4:
 ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2
 ; CHECK-NEXT: s_sub_i32 s1, 0, s2
@@ -319,9 +319,10 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT: BB1_2:
 ; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
 ; CHECK-NEXT: BB1_3: ; %Flow
-; CHECK-NEXT: s_and_b32 s1, s5, 1
+; CHECK-NEXT: s_xor_b32 s1, s5, -1
+; CHECK-NEXT: s_and_b32 s1, s1, 1
 ; CHECK-NEXT: s_cmp_lg_u32 s1, 0
-; CHECK-NEXT: s_cbranch_scc0 BB1_5
+; CHECK-NEXT: s_cbranch_scc1 BB1_5
 ; CHECK-NEXT: ; %bb.4:
 ; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2
 ; CHECK-NEXT: s_sub_i32 s1, 0, s2