
[Reassociate] swap binop operands to increase factoring potential

If we have a pair of binops feeding another pair of binops, rearrange the operands so
that the matching pair is together, because that allows easy factorization folds to
happen in instcombine:

((X << S) & Y) & (Z << S) --> ((X << S) & (Z << S)) & Y (reassociation)
                          --> ((X & Z) << S) & Y        (factorize shift from 'and' ops)
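
As a concrete sketch in IR (mirroring the @and_shl test below; value names are
illustrative):

  %sx = shl i8 %x, %shamt
  %sy = shl i8 %y, %shamt
  %a = and i8 %sx, %z
  %r = and i8 %a, %sy

reassociate now pairs the matching shifts:

  %a = and i8 %sx, %sy
  %r = and i8 %a, %z

and instcombine can then factor the shift out of the 'and' ops:

  %xy = and i8 %x, %y
  %s1 = shl i8 %xy, %shamt
  %r = and i8 %s1, %z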

This is part of solving PR37098:
https://bugs.llvm.org/show_bug.cgi?id=37098

Note that there's an instcombine version of this patch attached there, but we're trying
to reduce instcombine's responsibilities to improve compile-time efficiency.

For reasons I still don't completely understand, reassociate does this kind of transform
sometimes, but misses everything in my motivating cases.

On its own, this patch glues an independent cleanup step onto the end of the existing
RewriteExprTree() loop. We can build on it and do something stronger to better order the
full expression tree, as in D40049. That might be an alternative to the proposal to add a
separate reassociation pass (D41574).

Differential Revision: https://reviews.llvm.org/D45842

llvm-svn: 341288
Sanjay Patel 2018-09-02 14:22:54 +00:00
parent 713057a782
commit bee81c359d
3 changed files with 110 additions and 35 deletions

@@ -118,6 +118,7 @@ private:
   void OptimizeInst(Instruction *I);
   Instruction *canonicalizeNegConstExpr(Instruction *I);
   void BuildPairMap(ReversePostOrderTraversal<Function *> &RPOT);
+  void swapOperandsToMatchBinops(BinaryOperator &B);
 };
 
 } // end namespace llvm

@@ -63,6 +63,7 @@
 using namespace llvm;
 using namespace reassociate;
+using namespace PatternMatch;
 
 #define DEBUG_TYPE "reassociate"
@@ -2131,6 +2132,66 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
   ReassociateExpression(BO);
 }
 
+/// If we have an associative pair of binops with the same opcode and 2 of the 3
+/// operands to that pair of binops are some other matching binop, rearrange the
+/// operands of the associative binops so the matching ops are paired together.
+/// This transform creates factoring opportunities by pairing opcodes.
+/// TODO: Should those factoring optimizations be handled here or in instcombine?
+/// Example:
+///   ((X << S) & Y) & (Z << S) --> ((X << S) & (Z << S)) & Y (reassociation)
+///                             --> ((X & Z) << S) & Y (factorize shift from 'and' ops)
+void ReassociatePass::swapOperandsToMatchBinops(BinaryOperator &B) {
+  BinaryOperator *B0, *B1;
+  if (!B.isAssociative() || !B.isCommutative() ||
+      !match(&B, m_BinOp(m_BinOp(B0), m_BinOp(B1))))
+    return;
+
+  // We have (B0 op B1) where both operands are also binops.
+  // Canonicalize a binop with the same opcode as the parent binop (B) to B0 and
+  // a binop with a different opcode to B1.
+  Instruction::BinaryOps TopOpc = B.getOpcode();
+  if (B0->getOpcode() != TopOpc)
+    std::swap(B0, B1);
+
+  // If (1) we don't have a pair of binops with the same opcode or (2) B0 and B1
+  // already have the same opcode, there is nothing to do. If the binop with the
+  // same opcode (B0) has more than one use, reassociation would result in more
+  // instructions, so bail out.
+  Instruction::BinaryOps OtherOpc = B1->getOpcode();
+  if (B0->getOpcode() != TopOpc || !B0->hasOneUse() || OtherOpc == TopOpc)
+    return;
+
+  // Canonicalize a binop that matches B1's opcode to V00 (operand 0 of B0) and
+  // a value that does not match it to V01.
+  Value *V00 = B0->getOperand(0), *V01 = B0->getOperand(1);
+  if (!match(V00, m_BinOp()) ||
+      cast<BinaryOperator>(V00)->getOpcode() != OtherOpc)
+    std::swap(V00, V01);
+
+  // We need a binop with the same opcode as B1 in V00, and a value with a
+  // different opcode in V01.
+  BinaryOperator *B00, *B01;
+  if (!match(V00, m_BinOp(B00)) || B00->getOpcode() != OtherOpc ||
+      (match(V01, m_BinOp(B01)) && B01->getOpcode() == OtherOpc))
+    return;
+
+  // B00 and B1 are displaced matching binops, so pull them together:
+  // (B00 & V01) & B1 --> (B00 & B1) & V01
+  IRBuilder<> Builder(&B);
+  Builder.SetInstDebugLocation(&B);
+  Value *NewBO1 = Builder.CreateBinOp(TopOpc, B00, B1);
+  Value *NewBO2 = Builder.CreateBinOp(TopOpc, NewBO1, V01);
+
+  // Fast-math-flags propagate from B; wrapping flags are cleared.
+  if (auto *I1 = dyn_cast<Instruction>(NewBO1))
+    I1->copyIRFlags(&B, false);
+  if (auto *I2 = dyn_cast<Instruction>(NewBO2))
+    I2->copyIRFlags(&B, false);
+
+  B.replaceAllUsesWith(NewBO2);
+  return;
+}
+
 void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
   // First, walk the expression tree, linearizing the tree, collecting the
   // operand information.
@@ -2250,6 +2311,9 @@ void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
   // Now that we ordered and optimized the expressions, splat them back into
   // the expression tree, removing any unneeded nodes.
   RewriteExprTree(I, Ops);
+
+  // Try a final reassociation of the root of the tree.
+  swapOperandsToMatchBinops(*I);
 }
 
 void
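
A note on the flag handling in swapOperandsToMatchBinops() above: copyIRFlags(&B, false)
copies fast-math flags from the root binop B but skips the wrapping flags, so nuw/nsw are
conservatively dropped on the rewritten chain. Sketched in IR (compare the @mul_sub and
@fadd_fmul tests below):

  %a = mul nsw i8 %sx, %z    ; before
  %r = mul nuw i8 %a, %sy
  ; after the swap, the wrap flags are gone:
  %a = mul i8 %sx, %sy
  %r = mul i8 %a, %z

  %a = fadd fast float %sx, %z    ; before
  %r = fadd fast float %a, %sy
  ; after the swap, the fast-math flags survive:
  %a = fadd fast float %sx, %sy
  %r = fadd fast float %a, %z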


@@ -16,8 +16,8 @@ define i8 @and_shl(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 ; CHECK-LABEL: @and_shl(
 ; CHECK-NEXT:    [[SX:%.*]] = shl i8 [[X:%.*]], [[SHAMT:%.*]]
 ; CHECK-NEXT:    [[SY:%.*]] = shl i8 [[Y:%.*]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = and i8 [[SX]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[A:%.*]] = and i8 [[SX]], [[SY]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[A]], [[Z:%.*]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %sx = shl i8 %x, %shamt
@@ -31,8 +31,8 @@ define i8 @or_shl(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 ; CHECK-LABEL: @or_shl(
 ; CHECK-NEXT:    [[SX:%.*]] = shl i8 [[X:%.*]], [[SHAMT:%.*]]
 ; CHECK-NEXT:    [[SY:%.*]] = shl i8 [[Y:%.*]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = or i8 [[SX]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = or i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[A:%.*]] = or i8 [[SX]], [[SY]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[A]], [[Z:%.*]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %sx = shl i8 %x, %shamt
@@ -46,8 +46,8 @@ define i8 @xor_shl(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 ; CHECK-LABEL: @xor_shl(
 ; CHECK-NEXT:    [[SX:%.*]] = shl i8 [[X:%.*]], [[SHAMT:%.*]]
 ; CHECK-NEXT:    [[SY:%.*]] = shl i8 [[Y:%.*]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = xor i8 [[SX]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = xor i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[A:%.*]] = xor i8 [[SX]], [[SY]]
+; CHECK-NEXT:    [[R:%.*]] = xor i8 [[A]], [[Z:%.*]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %sx = shl i8 %x, %shamt
@@ -61,8 +61,8 @@ define i8 @and_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 ; CHECK-LABEL: @and_lshr(
 ; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X:%.*]], [[SHAMT:%.*]]
 ; CHECK-NEXT:    [[SY:%.*]] = lshr i8 [[Y:%.*]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = and i8 [[SX]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[A:%.*]] = and i8 [[SX]], [[SY]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[A]], [[Z:%.*]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %sx = lshr i8 %x, %shamt
@@ -76,8 +76,8 @@ define i8 @or_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 ; CHECK-LABEL: @or_lshr(
 ; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X:%.*]], [[SHAMT:%.*]]
 ; CHECK-NEXT:    [[SY:%.*]] = lshr i8 [[Y:%.*]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = or i8 [[SX]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = or i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[A:%.*]] = or i8 [[SX]], [[SY]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[A]], [[Z:%.*]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %sx = lshr i8 %x, %shamt
@@ -91,8 +91,8 @@ define i8 @xor_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 ; CHECK-LABEL: @xor_lshr(
 ; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X:%.*]], [[SHAMT:%.*]]
 ; CHECK-NEXT:    [[SY:%.*]] = lshr i8 [[Y:%.*]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = xor i8 [[SX]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = xor i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[A:%.*]] = xor i8 [[SX]], [[SY]]
+; CHECK-NEXT:    [[R:%.*]] = xor i8 [[A]], [[Z:%.*]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %sx = lshr i8 %x, %shamt
@@ -106,8 +106,8 @@ define i8 @and_ashr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 ; CHECK-LABEL: @and_ashr(
 ; CHECK-NEXT:    [[SX:%.*]] = ashr i8 [[X:%.*]], [[SHAMT:%.*]]
 ; CHECK-NEXT:    [[SY:%.*]] = ashr i8 [[Y:%.*]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = and i8 [[SX]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = and i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[A:%.*]] = and i8 [[SX]], [[SY]]
+; CHECK-NEXT:    [[R:%.*]] = and i8 [[A]], [[Z:%.*]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %sx = ashr i8 %x, %shamt
@@ -121,8 +121,8 @@ define i8 @or_ashr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 ; CHECK-LABEL: @or_ashr(
 ; CHECK-NEXT:    [[SX:%.*]] = ashr i8 [[X:%.*]], [[SHAMT:%.*]]
 ; CHECK-NEXT:    [[SY:%.*]] = ashr i8 [[Y:%.*]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = or i8 [[SX]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = or i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[A:%.*]] = or i8 [[SX]], [[SY]]
+; CHECK-NEXT:    [[R:%.*]] = or i8 [[A]], [[Z:%.*]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %sx = ashr i8 %x, %shamt
@@ -138,8 +138,8 @@ define <2 x i8> @xor_ashr(<2 x i8> %x, <2 x i8> %y, <2 x i8> %z, <2 x i8> %shamt
 ; CHECK-LABEL: @xor_ashr(
 ; CHECK-NEXT:    [[SX:%.*]] = ashr <2 x i8> [[X:%.*]], [[SHAMT:%.*]]
 ; CHECK-NEXT:    [[SY:%.*]] = ashr <2 x i8> [[Y:%.*]], [[SHAMT]]
-; CHECK-NEXT:    [[A:%.*]] = xor <2 x i8> [[SX]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = xor <2 x i8> [[A]], [[SY]]
+; CHECK-NEXT:    [[A:%.*]] = xor <2 x i8> [[SX]], [[SY]]
+; CHECK-NEXT:    [[R:%.*]] = xor <2 x i8> [[A]], [[Z:%.*]]
 ; CHECK-NEXT:    ret <2 x i8> [[R]]
 ;
   %sx = ashr <2 x i8> %x, %shamt
@@ -203,19 +203,29 @@ define i8 @xor_lshr_multiuse(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 }
 
 ; Math ops work too. Change instruction positions too to verify placement.
+; We only care about extra uses of the first associative value - in this
+; case, it's %a. Everything else can have extra uses.
+
+declare void @use(i8)
 
 define i8 @add_lshr(i8 %x, i8 %y, i8 %z, i8 %shamt) {
 ; CHECK-LABEL: @add_lshr(
 ; CHECK-NEXT:    [[SX:%.*]] = lshr i8 [[X:%.*]], [[SHAMT:%.*]]
-; CHECK-NEXT:    [[A:%.*]] = add i8 [[SX]], [[Z:%.*]]
 ; CHECK-NEXT:    [[SY:%.*]] = lshr i8 [[Y:%.*]], [[SHAMT]]
-; CHECK-NEXT:    [[R:%.*]] = add i8 [[A]], [[SY]]
-; CHECK-NEXT:    ret i8 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[SX]], [[SY]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[TMP1]], [[Z:%.*]]
+; CHECK-NEXT:    call void @use(i8 [[SX]])
+; CHECK-NEXT:    call void @use(i8 [[SY]])
+; CHECK-NEXT:    call void @use(i8 [[TMP2]])
+; CHECK-NEXT:    ret i8 [[TMP2]]
 ;
   %sx = lshr i8 %x, %shamt
   %a = add i8 %sx, %z
   %sy = lshr i8 %y, %shamt
   %r = add i8 %a, %sy
+  call void @use(i8 %sx)
+  call void @use(i8 %sy)
+  call void @use(i8 %r)
   ret i8 %r
 }
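
Per the comment above, the one-use check in swapOperandsToMatchBinops() only constrains
the first associative value (%a here): if it had an extra use, the rewrite would add an
instruction instead of enabling a fold, so the function bails out. A hypothetical variant
that should therefore be left alone:

  %sx = lshr i8 %x, %shamt
  %a = add i8 %sx, %z
  %sy = lshr i8 %y, %shamt
  %r = add i8 %a, %sy
  call void @use(i8 %a)    ; extra use of %a blocks the swap
  ret i8 %r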
@@ -225,8 +235,8 @@ define i8 @mul_sub(i8 %x, i8 %y, i8 %z, i8 %m) {
 ; CHECK-LABEL: @mul_sub(
 ; CHECK-NEXT:    [[SX:%.*]] = sub i8 [[X:%.*]], [[M:%.*]]
 ; CHECK-NEXT:    [[SY:%.*]] = sub i8 [[Y:%.*]], [[M]]
-; CHECK-NEXT:    [[A:%.*]] = mul nsw i8 [[SX]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = mul nuw i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[A:%.*]] = mul i8 [[SX]], [[SY]]
+; CHECK-NEXT:    [[R:%.*]] = mul i8 [[A]], [[Z:%.*]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %sx = sub i8 %x, %m
@@ -239,9 +249,9 @@ define i8 @mul_sub(i8 %x, i8 %y, i8 %z, i8 %m) {
 define i8 @add_mul(i8 %x, i8 %y, i8 %z, i8 %m) {
 ; CHECK-LABEL: @add_mul(
 ; CHECK-NEXT:    [[SX:%.*]] = mul nuw i8 [[X:%.*]], 42
-; CHECK-NEXT:    [[A:%.*]] = add nuw i8 [[Z:%.*]], [[SX]]
 ; CHECK-NEXT:    [[SY:%.*]] = mul nsw i8 [[M:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = add nsw i8 [[A]], [[SY]]
+; CHECK-NEXT:    [[A:%.*]] = add i8 [[SX]], [[SY]]
+; CHECK-NEXT:    [[R:%.*]] = add i8 [[A]], [[Z:%.*]]
 ; CHECK-NEXT:    ret i8 [[R]]
 ;
   %sx = mul nuw i8 %x, 42
@@ -257,9 +267,9 @@ define i8 @add_mul(i8 %x, i8 %y, i8 %z, i8 %m) {
 define float @fadd_fmul(float %x, float %y, float %z, float %m) {
 ; CHECK-LABEL: @fadd_fmul(
 ; CHECK-NEXT:    [[SX:%.*]] = fmul float [[X:%.*]], [[M:%.*]]
-; CHECK-NEXT:    [[A:%.*]] = fadd fast float [[SX]], [[Z:%.*]]
 ; CHECK-NEXT:    [[SY:%.*]] = fmul float [[Y:%.*]], [[M]]
-; CHECK-NEXT:    [[R:%.*]] = fadd fast float [[A]], [[SY]]
+; CHECK-NEXT:    [[A:%.*]] = fadd fast float [[SX]], [[SY]]
+; CHECK-NEXT:    [[R:%.*]] = fadd fast float [[A]], [[Z:%.*]]
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %sx = fmul float %x, %m
@@ -273,8 +283,8 @@ define float @fmul_fdiv(float %x, float %y, float %z, float %m) {
 ; CHECK-LABEL: @fmul_fdiv(
 ; CHECK-NEXT:    [[SX:%.*]] = fdiv float [[X:%.*]], [[M:%.*]]
 ; CHECK-NEXT:    [[SY:%.*]] = fdiv float [[Y:%.*]], 4.200000e+01
-; CHECK-NEXT:    [[A:%.*]] = fmul fast float [[SY]], [[Z:%.*]]
-; CHECK-NEXT:    [[R:%.*]] = fmul fast float [[A]], [[SX]]
+; CHECK-NEXT:    [[A:%.*]] = fmul fast float [[SY]], [[SX]]
+; CHECK-NEXT:    [[R:%.*]] = fmul fast float [[A]], [[Z:%.*]]
 ; CHECK-NEXT:    ret float [[R]]
 ;
   %sx = fdiv float %x, %m
@@ -284,7 +294,7 @@ define float @fmul_fdiv(float %x, float %y, float %z, float %m) {
   ret float %r
 }
 
-; Verify that debug info for modified instructions gets discarded (references become undef).
+; Verify that debug info for modified instructions is not invalid.
 
 define i32 @and_shl_dbg(i32 %x, i32 %y, i32 %z, i32 %shamt) {
 ; CHECK-LABEL: @and_shl_dbg(
@@ -296,11 +306,11 @@ define i32 @and_shl_dbg(i32 %x, i32 %y, i32 %z, i32 %shamt) {
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[SHL]], metadata !16, metadata !DIExpression()), !dbg !25
 ; CHECK-NEXT:    [[SHL1:%.*]] = shl i32 [[Y]], [[SHAMT]], !dbg !26
 ; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[SHL1]], metadata !17, metadata !DIExpression()), !dbg !27
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[SHL]], [[Z]], !dbg !28
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[AND]], metadata !18, metadata !DIExpression()), !dbg !29
-; CHECK-NEXT:    [[AND2:%.*]] = and i32 [[AND]], [[SHL1]], !dbg !30
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[AND2]], metadata !19, metadata !DIExpression()), !dbg !31
-; CHECK-NEXT:    ret i32 [[AND2]], !dbg !32
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata !2, metadata !18, metadata !DIExpression()), !dbg !28
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 [[SHL]], [[SHL1]], !dbg !29
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], [[Z]], !dbg !29
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata i32 [[TMP2]], metadata !19, metadata !DIExpression()), !dbg !30
+; CHECK-NEXT:    ret i32 [[TMP2]], !dbg !31
 ;
   call void @llvm.dbg.value(metadata i32 %x, metadata !13, metadata !DIExpression()), !dbg !21
   call void @llvm.dbg.value(metadata i32 %y, metadata !14, metadata !DIExpression()), !dbg !22