1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 04:02:41 +01:00

[SLPVectorizer] Fix alternate opcode + shuffle cost function to correctly handle SK_Select patterns.

We were always using the opcodes of the first 2 scalars for the costs of the alternate opcode + shuffle. This made sense when we used SK_Alternate and opcodes were guaranteed to be alternating, but this fails for the more general SK_Select case.

This fix exposes an issue demonstrated by the fmul_fdiv_v4f32_const test - the SLM model has v4f32 fdiv costs which are more than twice the f32 scalar cost, meaning that the cost model determines that the vectorization is not performant. Unfortunately it completely ignores the fact that the fdiv by a constant will later be changed into an fmul by InstCombine, giving a much lower cost vectorization. But at least we're seeing this now...

llvm-svn: 336095
This commit is contained in:
Simon Pilgrim 2018-07-02 11:28:01 +00:00
parent a0b4decfe6
commit ffa8d2ee7c
2 changed files with 27 additions and 7 deletions

View File

@ -2375,14 +2375,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Instruction *I = cast<Instruction>(i);
if (!I)
break;
assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
ScalarCost += TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy);
}
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
Instruction *I0 = cast<Instruction>(VL[0]);
VecCost = TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy);
Instruction *I1 = cast<Instruction>(VL[1]);
VecCost += TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy);
VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy);
VecCost += TTI->getArithmeticInstrCost(S.AltOpcode, VecTy);
VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
return ReuseShuffleCost + VecCost - ScalarCost;
}

View File

@ -120,9 +120,30 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
}
define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
; CHECK-LABEL: @fmul_fdiv_v4f32_const(
; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
; CHECK-NEXT: ret <4 x float> [[TMP1]]
; SSE-LABEL: @fmul_fdiv_v4f32_const(
; SSE-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
; SSE-NEXT: ret <4 x float> [[TMP1]]
;
; SLM-LABEL: @fmul_fdiv_v4f32_const(
; SLM-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
; SLM-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00
; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[AB0]], i32 0
; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1
; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2
; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
; SLM-NEXT: ret <4 x float> [[R3]]
;
; AVX-LABEL: @fmul_fdiv_v4f32_const(
; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
; AVX-NEXT: ret <4 x float> [[TMP1]]
;
; AVX512-LABEL: @fmul_fdiv_v4f32_const(
; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
; AVX512-NEXT: ret <4 x float> [[TMP1]]
;
%a0 = extractelement <4 x float> %a, i32 0
%a1 = extractelement <4 x float> %a, i32 1