[SLPVectorizer] Fix alternate opcode + shuffle cost function to correct handle SK_Select patterns.

We were always using the opcodes of the first 2 scalars for the costs of the alternate opcode + shuffle. This made sense when we used SK_Alternate and opcodes were guaranteed to be alternating, but this fails for the more general SK_Select case. This fix exposes an issue demonstrated by the fmul_fdiv_v4f32_const test - the SLM model has v4f32 fdiv costs which are more than twice those of the f32 scalar cost, meaning that the cost model determines that the vectorization is not performant. Unfortunately it completely ignores the fact that the fdiv by a constant will be changed into a fmul by InstCombine for a much lower cost vectorization. But at least we're seeing this now... llvm-svn: 336095
2024-11-25 04:02:41 +01:00 · 2018-07-02 11:28:01 +00:00 · 2018-07-02 11:28:01 +00:00 · ffa8d2ee7c
commit ffa8d2ee7c
parent a0b4decfe6
2 changed files with 27 additions and 7 deletions
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@ -2375,14 +2375,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
        Instruction *I = cast<Instruction>(i);
        if (!I)
          break;
+        assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
        ScalarCost += TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy);
      }
      // VecCost is equal to sum of the cost of creating 2 vectors
      // and the cost of creating shuffle.
-      Instruction *I0 = cast<Instruction>(VL[0]);
-      VecCost = TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy);
-      Instruction *I1 = cast<Instruction>(VL[1]);
-      VecCost += TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy);
+      VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy);
+      VecCost += TTI->getArithmeticInstrCost(S.AltOpcode, VecTy);
      VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
      return ReuseShuffleCost + VecCost - ScalarCost;
    }
--- a/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
+++ b/test/Transforms/SLPVectorizer/X86/alternate-fp.ll
@ -120,9 +120,30 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
 }

 define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
-; CHECK-LABEL: @fmul_fdiv_v4f32_const(
-; CHECK-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
-; CHECK-NEXT:    ret <4 x float> [[TMP1]]
+; SSE-LABEL: @fmul_fdiv_v4f32_const(
+; SSE-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; SSE-NEXT:    ret <4 x float> [[TMP1]]
+;
+; SLM-LABEL: @fmul_fdiv_v4f32_const(
+; SLM-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; SLM-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; SLM-NEXT:    [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
+; SLM-NEXT:    [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
+; SLM-NEXT:    [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00
+; SLM-NEXT:    [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
+; SLM-NEXT:    [[R0:%.*]] = insertelement <4 x float> undef, float [[AB0]], i32 0
+; SLM-NEXT:    [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1
+; SLM-NEXT:    [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2
+; SLM-NEXT:    [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
+; SLM-NEXT:    ret <4 x float> [[R3]]
+;
+; AVX-LABEL: @fmul_fdiv_v4f32_const(
+; AVX-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX-NEXT:    ret <4 x float> [[TMP1]]
+;
+; AVX512-LABEL: @fmul_fdiv_v4f32_const(
+; AVX512-NEXT:    [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
+; AVX512-NEXT:    ret <4 x float> [[TMP1]]
 ;
  %a0 = extractelement <4 x float> %a, i32 0
  %a1 = extractelement <4 x float> %a, i32 1