[InstCombine] reassociate diff of sums into sum of diffs

This is the integer sibling to D81491. (a[0] + a[1] + a[2] + a[3]) - (b[0] + b[1] + b[2] +b[3]) --> (a[0] - b[0]) + (a[1] - b[1]) + (a[2] - b[2]) + (a[3] - b[3]) Removing the "experimental" from these intrinsics is likely not too far away.
2025-01-31 20:51:52 +01:00 · 2020-06-22 20:45:00 -04:00 · 2020-06-22 20:45:00 -04:00 · d6c0ccc19e
commit d6c0ccc19e
parent 1e920f102e
3 changed files with 27 additions and 8 deletions
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@ -1787,6 +1787,21 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
    return BinaryOperator::CreateSub(XZ, YW);
  }

+  auto m_AddRdx = [](Value *&Vec) {
+    return m_OneUse(
+        m_Intrinsic<Intrinsic::experimental_vector_reduce_add>(m_Value(Vec)));
+  };
+  Value *V0, *V1;
+  if (match(Op0, m_AddRdx(V0)) && match(Op1, m_AddRdx(V1)) &&
+      V0->getType() == V1->getType()) {
+    // Difference of sums is sum of differences:
+    // add_rdx(V0) - add_rdx(V1) --> add_rdx(V0 - V1)
+    Value *Sub = Builder.CreateSub(V0, V1);
+    Value *Rdx = Builder.CreateIntrinsic(
+        Intrinsic::experimental_vector_reduce_add, {Sub->getType()}, {Sub});
+    return replaceInstUsesWith(I, Rdx);
+  }
+
  if (Constant *C = dyn_cast<Constant>(Op0)) {
    Value *X;
    if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
--- a/test/Transforms/InstCombine/vector-reductions.ll
+++ b/test/Transforms/InstCombine/vector-reductions.ll
@ -88,10 +88,9 @@ define float @diff_of_sums_type_mismatch(float %a0, <4 x float> %v0, float %a1,

 define i32 @diff_of_sums_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @diff_of_sums_v4i32(
-; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]])
-; CHECK-NEXT:    [[R1:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V1:%.*]])
-; CHECK-NEXT:    [[R:%.*]] = sub i32 [[R0]], [[R1]]
-; CHECK-NEXT:    ret i32 [[R]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sub <4 x i32> [[V0:%.*]], [[V1:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    ret i32 [[TMP2]]
 ;
  %r0 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v0)
  %r1 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %v1)
@ -99,6 +98,8 @@ define i32 @diff_of_sums_v4i32(<4 x i32> %v0, <4 x i32> %v1) {
  ret i32 %r
 }

+; negative test - extra uses could create extra instructions
+
 define i32 @diff_of_sums_v4i32_extra_use1(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @diff_of_sums_v4i32_extra_use1(
 ; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]])
@ -114,6 +115,8 @@ define i32 @diff_of_sums_v4i32_extra_use1(<4 x i32> %v0, <4 x i32> %v1) {
  ret i32 %r
 }

+; negative test - extra uses could create extra instructions
+
 define i32 @diff_of_sums_v4i32_extra_use2(<4 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @diff_of_sums_v4i32_extra_use2(
 ; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[V0:%.*]])
@ -129,6 +132,8 @@ define i32 @diff_of_sums_v4i32_extra_use2(<4 x i32> %v0, <4 x i32> %v1) {
  ret i32 %r
 }

+; negative test - can't reassociate different vector types
+
 define i32 @diff_of_sums_type_mismatch2(<8 x i32> %v0, <4 x i32> %v1) {
 ; CHECK-LABEL: @diff_of_sums_type_mismatch2(
 ; CHECK-NEXT:    [[R0:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> [[V0:%.*]])
--- a/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
+++ b/test/Transforms/PhaseOrdering/X86/vector-reductions.ll
@ -132,10 +132,9 @@ define i32 @TestVectorsEqual_alt(i32* noalias %Vec0, i32* noalias %Vec1, i32 %To
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32* [[VEC1:%.*]] to <4 x i32>*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
-; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP1]])
-; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP3]])
-; CHECK-NEXT:    [[ADD_3:%.*]] = sub i32 [[TMP4]], [[TMP5]]
-; CHECK-NEXT:    [[CMP3:%.*]] = icmp ule i32 [[ADD_3]], [[TOLERANCE:%.*]]
+; CHECK-NEXT:    [[TMP4:%.*]] = sub <4 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[TMP4]])
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp ule i32 [[TMP5]], [[TOLERANCE:%.*]]
 ; CHECK-NEXT:    [[COND:%.*]] = zext i1 [[CMP3]] to i32
 ; CHECK-NEXT:    ret i32 [[COND]]
 ;