Mirror of https://github.com/RPCS3/llvm-mirror.git, synced 2025-01-31 20:51:52 +01:00
[InstCombine] fold reassociative FP add into start value of fadd reduction
This pattern is visible in unrolled and vectorized loops. Although the backend seems to be able to reassociate to ideal form in the examples I looked at, we might as well do that in IR for efficiency.
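In IR terms, the two folds absorb the scalar addend of a one-use fadd reduction into the reduction's start value. A minimal before/after sketch (hand-written to mirror the fadd_rdx and fadd_rdx_nonzero_start_const_op tests updated below; value names are illustrative, not taken verbatim from the diff):

; fadd (rdx 0.0, X), Y --> rdx Y, X
%rdx = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> %v)
%add = fadd fast float %rdx, %x
; becomes:
%add = call fast float @llvm.vector.reduce.fadd.v4f32(float %x, <4 x float> %v)

; fadd (rdx StartC, X), C --> rdx (C + StartC), X
%rdx2 = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float 42.0, <4 x float> %v)
%add2 = fadd reassoc nsz float %rdx2, -9.0
; becomes:
%add2 = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float 33.0, <4 x float> %v)

Both rewrites require reassoc and nsz on the outer fadd and a single use of the reduction; the fadd_rdx_fmf and fadd_rdx_extra_use negative tests below check that the fold is skipped otherwise.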
parent 1f0abe2070
commit 8921c5a885
@@ -1622,6 +1622,27 @@ Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
   if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
     if (Instruction *F = factorizeFAddFSub(I, Builder))
       return F;
+
+    // Try to fold fadd into start value of reduction intrinsic.
+    if (match(&I, m_c_FAdd(m_OneUse(m_Intrinsic<Intrinsic::vector_reduce_fadd>(
+                               m_AnyZeroFP(), m_Value(X))),
+                           m_Value(Y)))) {
+      // fadd (rdx 0.0, X), Y --> rdx Y, X
+      return replaceInstUsesWith(
+          I, Builder.CreateIntrinsic(Intrinsic::vector_reduce_fadd,
+                                     {X->getType()}, {Y, X}, &I));
+    }
+    const APFloat *StartC, *C;
+    if (match(LHS, m_OneUse(m_Intrinsic<Intrinsic::vector_reduce_fadd>(
+                       m_APFloat(StartC), m_Value(X)))) &&
+        match(RHS, m_APFloat(C))) {
+      // fadd (rdx StartC, X), C --> rdx (C + StartC), X
+      Constant *NewStartC = ConstantFP::get(I.getType(), *C + *StartC);
+      return replaceInstUsesWith(
+          I, Builder.CreateIntrinsic(Intrinsic::vector_reduce_fadd,
+                                     {X->getType()}, {NewStartC, X}, &I));
+    }
+
     if (Value *V = FAddCombine(Builder).simplify(&I))
       return replaceInstUsesWith(I, V);
   }
@@ -391,9 +391,8 @@ define float @fmul_fneg2_extra_use3(float %x, float %py, float %z) {
 
 define float @fadd_rdx(float %x, <4 x float> %v) {
 ; CHECK-LABEL: @fadd_rdx(
-; CHECK-NEXT: [[RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[V:%.*]])
-; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[RDX]], [[X:%.*]]
-; CHECK-NEXT: ret float [[ADD]]
+; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[X:%.*]], <4 x float> [[V:%.*]])
+; CHECK-NEXT: ret float [[TMP1]]
 ;
   %rdx = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> %v)
   %add = fadd fast float %rdx, %x
@@ -403,9 +402,8 @@ define float @fadd_rdx(float %x, <4 x float> %v) {
 define float @fadd_rdx_commute(float %x, <4 x float> %v) {
 ; CHECK-LABEL: @fadd_rdx_commute(
 ; CHECK-NEXT: [[D:%.*]] = fdiv float 4.200000e+01, [[X:%.*]]
-; CHECK-NEXT: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[V:%.*]])
-; CHECK-NEXT: [[ADD:%.*]] = fadd reassoc nsz float [[D]], [[RDX]]
-; CHECK-NEXT: ret float [[ADD]]
+; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz float @llvm.vector.reduce.fadd.v4f32(float [[D]], <4 x float> [[V:%.*]])
+; CHECK-NEXT: ret float [[TMP1]]
 ;
   %d = fdiv float 42.0, %x
   %rdx = call float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %v)
@@ -413,6 +411,8 @@ define float @fadd_rdx_commute(float %x, <4 x float> %v) {
   ret float %add
 }
 
+; Negative test - require nsz to be safer (and reassoc obviously).
+
 define float @fadd_rdx_fmf(float %x, <4 x float> %v) {
 ; CHECK-LABEL: @fadd_rdx_fmf(
 ; CHECK-NEXT: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[V:%.*]])
@@ -424,6 +424,8 @@ define float @fadd_rdx_fmf(float %x, <4 x float> %v) {
   ret float %add
 }
 
+; Negative test - don't replace a single add with another reduction.
+
 define float @fadd_rdx_extra_use(float %x, <4 x float> %v) {
 ; CHECK-LABEL: @fadd_rdx_extra_use(
 ; CHECK-NEXT: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[V:%.*]])
@@ -439,15 +441,16 @@ define float @fadd_rdx_extra_use(float %x, <4 x float> %v) {
 
 define float @fadd_rdx_nonzero_start_const_op(<4 x float> %v) {
 ; CHECK-LABEL: @fadd_rdx_nonzero_start_const_op(
-; CHECK-NEXT: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float 4.200000e+01, <4 x float> [[V:%.*]])
-; CHECK-NEXT: [[ADD:%.*]] = fadd reassoc ninf nsz float [[RDX]], -9.000000e+00
-; CHECK-NEXT: ret float [[ADD]]
+; CHECK-NEXT: [[TMP1:%.*]] = call reassoc ninf nsz float @llvm.vector.reduce.fadd.v4f32(float 3.300000e+01, <4 x float> [[V:%.*]])
+; CHECK-NEXT: ret float [[TMP1]]
 ;
   %rdx = call float @llvm.vector.reduce.fadd.v4f32(float 42.0, <4 x float> %v)
   %add = fadd reassoc nsz ninf float %rdx, -9.0
   ret float %add
 }
 
+; Negative test - we don't change the order of ops unless it saves an instruction.
+
 define float @fadd_rdx_nonzero_start_variable_op(float %x, <4 x float> %v) {
 ; CHECK-LABEL: @fadd_rdx_nonzero_start_variable_op(
 ; CHECK-NEXT: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float 4.200000e+01, <4 x float> [[V:%.*]])
@@ -12,7 +12,7 @@ define i32 @add_v4i32(i32* %p) #0 {
 ; CHECK-LABEL: @add_v4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, [[TBAA0:!tbaa !.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !tbaa [[TBAA0:![0-9]+]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -51,7 +51,7 @@ define signext i16 @mul_v8i16(i16* %p) #0 {
 ; CHECK-LABEL: @mul_v8i16(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[P:%.*]] to <8 x i16>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2, [[TBAA4:!tbaa !.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2, !tbaa [[TBAA4:![0-9]+]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <8 x i16> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <8 x i16> [[BIN_RDX]], <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -95,7 +95,7 @@ define signext i8 @or_v16i8(i8* %p) #0 {
 ; CHECK-LABEL: @or_v16i8(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P:%.*]] to <16 x i8>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1, [[TBAA6:!tbaa !.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1, !tbaa [[TBAA6:![0-9]+]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = or <16 x i8> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_SHUF4:%.*]] = shufflevector <16 x i8> [[BIN_RDX]], <16 x i8> poison, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -141,7 +141,7 @@ define i32 @smin_v4i32(i32* %p) #0 {
 ; CHECK-LABEL: @smin_v4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, [[TBAA0]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <4 x i32> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP1]], <4 x i32> [[RDX_SHUF]]
@@ -195,7 +195,7 @@ define i32 @umax_v4i32(i32* %p) #0 {
 ; CHECK-LABEL: @umax_v4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P:%.*]] to <4 x i32>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, [[TBAA0]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4, !tbaa [[TBAA0]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <4 x i32> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x i32> [[TMP1]], <4 x i32> [[RDX_SHUF]]
@@ -249,15 +249,14 @@ define float @fadd_v4i32(float* %p) #0 {
 ; CHECK-LABEL: @fadd_v4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7:!tbaa !.*]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, !tbaa [[TBAA7:![0-9]+]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: [[BIN_RDX4:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF3]]
 ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[BIN_RDX4]], i32 0
-; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast float -0.000000e+00, [[TMP2]]
-; CHECK-NEXT: [[OP_EXTRA:%.*]] = fadd fast float [[BIN_RDX5]], 4.200000e+01
-; CHECK-NEXT: ret float [[OP_EXTRA]]
+; CHECK-NEXT: [[BIN_RDX5:%.*]] = fadd fast float 4.200000e+01, [[TMP2]]
+; CHECK-NEXT: ret float [[BIN_RDX5]]
 ;
 entry:
   br label %for.cond
@@ -290,7 +289,7 @@ define float @fmul_v4i32(float* %p) #0 {
 ; CHECK-LABEL: @fmul_v4i32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, !tbaa [[TBAA7]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul fast <4 x float> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
@@ -331,7 +330,7 @@ define float @fmin_v4f32(float* %p) #0 {
 ; CHECK-LABEL: @fmin_v4f32(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7]]
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, !tbaa [[TBAA7]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
 ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x float> [[TMP1]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP1]], <4 x float> [[RDX_SHUF]]
@@ -410,7 +409,7 @@ cond.end: ; preds = %cond.false, %cond.t
 define float @findMax(<8 x float>* byval(<8 x float>) align 16 %0) {
 ; CHECK-LABEL: @findMax(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[V:%.*]] = load <8 x float>, <8 x float>* [[TMP0:%.*]], align 16, [[TBAA0]]
+; CHECK-NEXT: [[V:%.*]] = load <8 x float>, <8 x float>* [[TMP0:%.*]], align 16, !tbaa [[TBAA0]]
 ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[V]], <8 x float> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf nsz ogt <8 x float> [[V]], [[RDX_SHUF]]
 ; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf nsz <8 x i1> [[RDX_MINMAX_CMP]], <8 x float> [[V]], <8 x float> [[RDX_SHUF]]