mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-25 04:02:41 +01:00
[SLPVectorizer] Fix alternate opcode + shuffle cost function to correctly handle SK_Select patterns.
We were always using the opcodes of the first 2 scalars for the costs of the alternate opcode + shuffle. This made sense when we used SK_Alternate and opcodes were guaranteed to be alternating, but this fails for the more general SK_Select case. This fix exposes an issue demonstrated by the fmul_fdiv_v4f32_const test - the SLM model has v4f32 fdiv costs that are more than twice the f32 scalar cost, meaning that the cost model determines that the vectorization is not performant. Unfortunately, it completely ignores the fact that the fdiv by a constant will be changed into an fmul by InstCombine for a much lower-cost vectorization. But at least we're seeing this now... llvm-svn: 336095
This commit is contained in:
parent
a0b4decfe6
commit
ffa8d2ee7c
@ -2375,14 +2375,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
|
||||
Instruction *I = cast<Instruction>(i);
|
||||
if (!I)
|
||||
break;
|
||||
assert(S.isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
|
||||
ScalarCost += TTI->getArithmeticInstrCost(I->getOpcode(), ScalarTy);
|
||||
}
|
||||
// VecCost is equal to sum of the cost of creating 2 vectors
|
||||
// and the cost of creating shuffle.
|
||||
Instruction *I0 = cast<Instruction>(VL[0]);
|
||||
VecCost = TTI->getArithmeticInstrCost(I0->getOpcode(), VecTy);
|
||||
Instruction *I1 = cast<Instruction>(VL[1]);
|
||||
VecCost += TTI->getArithmeticInstrCost(I1->getOpcode(), VecTy);
|
||||
VecCost = TTI->getArithmeticInstrCost(S.Opcode, VecTy);
|
||||
VecCost += TTI->getArithmeticInstrCost(S.AltOpcode, VecTy);
|
||||
VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
|
||||
return ReuseShuffleCost + VecCost - ScalarCost;
|
||||
}
|
||||
|
@ -120,9 +120,30 @@ define <8 x float> @fmul_fdiv_v8f32(<8 x float> %a, <8 x float> %b) {
|
||||
}
|
||||
|
||||
define <4 x float> @fmul_fdiv_v4f32_const(<4 x float> %a) {
|
||||
; CHECK-LABEL: @fmul_fdiv_v4f32_const(
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
|
||||
; CHECK-NEXT: ret <4 x float> [[TMP1]]
|
||||
; SSE-LABEL: @fmul_fdiv_v4f32_const(
|
||||
; SSE-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
|
||||
; SSE-NEXT: ret <4 x float> [[TMP1]]
|
||||
;
|
||||
; SLM-LABEL: @fmul_fdiv_v4f32_const(
|
||||
; SLM-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
|
||||
; SLM-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
|
||||
; SLM-NEXT: [[A2:%.*]] = extractelement <4 x float> [[A]], i32 2
|
||||
; SLM-NEXT: [[A3:%.*]] = extractelement <4 x float> [[A]], i32 3
|
||||
; SLM-NEXT: [[AB0:%.*]] = fmul float [[A0]], 2.000000e+00
|
||||
; SLM-NEXT: [[AB3:%.*]] = fmul float [[A3]], 2.000000e+00
|
||||
; SLM-NEXT: [[R0:%.*]] = insertelement <4 x float> undef, float [[AB0]], i32 0
|
||||
; SLM-NEXT: [[R1:%.*]] = insertelement <4 x float> [[R0]], float [[A1]], i32 1
|
||||
; SLM-NEXT: [[R2:%.*]] = insertelement <4 x float> [[R1]], float [[A2]], i32 2
|
||||
; SLM-NEXT: [[R3:%.*]] = insertelement <4 x float> [[R2]], float [[AB3]], i32 3
|
||||
; SLM-NEXT: ret <4 x float> [[R3]]
|
||||
;
|
||||
; AVX-LABEL: @fmul_fdiv_v4f32_const(
|
||||
; AVX-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
|
||||
; AVX-NEXT: ret <4 x float> [[TMP1]]
|
||||
;
|
||||
; AVX512-LABEL: @fmul_fdiv_v4f32_const(
|
||||
; AVX512-NEXT: [[TMP1:%.*]] = fmul <4 x float> [[A:%.*]], <float 2.000000e+00, float 1.000000e+00, float 1.000000e+00, float 2.000000e+00>
|
||||
; AVX512-NEXT: ret <4 x float> [[TMP1]]
|
||||
;
|
||||
%a0 = extractelement <4 x float> %a, i32 0
|
||||
%a1 = extractelement <4 x float> %a, i32 1
|
||||
|
Loading…
Reference in New Issue
Block a user