1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 03:02:36 +01:00

[CostModel][X86] Improve accuracy of vXi8 multiply reduction costs

After rG47321c311bdbe0145b9bf45d822185c37b19fa50 we promote vXi8 reductions to vXi16 to create a much faster PMULLW mul reduction, followed by a (free) truncation. This avoids the high cost of repeated vXi8 multiplications (which extend+multiply+truncate to/from vXi16 types....).

Fixes the missing vXi8 mul reduction vectorization in PR42674 (Comment #20) 'mul16' test case.
This commit is contained in:
Simon Pilgrim 2021-04-06 11:40:37 +01:00
parent d2ae27100c
commit d50aefd3c1
2 changed files with 74 additions and 44 deletions

View File

@ -3408,6 +3408,16 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
auto *ValVTy = cast<FixedVectorType>(ValTy);
// Special case: vXi8 mul reductions are performed as vXi16.
if (ISD == ISD::MUL && MTy.getScalarType() == MVT::i8) {
auto *WideSclTy = IntegerType::get(ValVTy->getContext(), 16);
auto *WideVecTy = FixedVectorType::get(WideSclTy, ValVTy->getNumElements());
return getCastInstrCost(Instruction::ZExt, WideVecTy, ValTy,
TargetTransformInfo::CastContextHint::None,
CostKind) +
getArithmeticReductionCost(Opcode, WideVecTy, IsPairwise, CostKind);
}
unsigned ArithmeticCost = 0;
if (LT.first != 1 && MTy.isVector() &&
MTy.getVectorNumElements() < ValVTy->getNumElements()) {

View File

@ -187,64 +187,84 @@ define i32 @reduce_i16(i32 %arg) {
}
define i32 @reduce_i8(i32 %arg) {
; SSE-LABEL: 'reduce_i8'
; SSE-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; SSE-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; SSE-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; SSE-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; SSE-NEXT: Cost Model: Found an estimated cost of 65 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; SSE-NEXT: Cost Model: Found an estimated cost of 89 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; SSE-NEXT: Cost Model: Found an estimated cost of 137 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; SSE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
; SSE2-LABEL: 'reduce_i8'
; SSE2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; SSE2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSSE3-LABEL: 'reduce_i8'
; SSSE3-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 46 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; SSSE3-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; SSE42-LABEL: 'reduce_i8'
; SSE42-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; SSE42-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX1-LABEL: 'reduce_i8'
; AVX1-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 40 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 53 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 144 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 39 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX1-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX2-LABEL: 'reduce_i8'
; AVX2-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 17 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 58 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 92 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX2-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512F-LABEL: 'reduce_i8'
; AVX512F-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX512F-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512BW-LABEL: 'reduce_i8'
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 42 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX512BW-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
; AVX512DQ-LABEL: 'reduce_i8'
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 45 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V16 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %V32 = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V64 = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V128 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef)
; AVX512DQ-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
;
%V2 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef)