
[CostModel][X86] Add CTPOP scalar costs (PR43656)

Add specific scalar costs for ctpop instructions; these are based on llvm-mca's throughput numbers for the SLM model (the oldest scheduling model we have).

For targets supporting POPCNT, we provide overrides that assume 1cy costs.

llvm-svn: 374775
Simon Pilgrim 2019-10-14 14:07:43 +00:00
parent 4f6c7c9728
commit 92f62b5161
3 changed files with 73 additions and 26 deletions
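Why the new NOPOPCNT numbers land around 7-10: without a native popcnt instruction, the backend expands scalar ctpop into the classic SWAR bit-twiddling sequence, roughly ten cheap ALU/multiply ops for i64. Below is a minimal C++ sketch of that expansion in its standard textbook form (illustrative only, not copied from LLVM's lowering code):

#include <cstdint>

// Parallel (SWAR) popcount: accumulate bit counts in 2-, 4-, then
// 8-bit lanes, then sum the per-byte counts with one multiply.
uint64_t ctpop64(uint64_t X) {
  X = X - ((X >> 1) & 0x5555555555555555ULL);            // 2-bit sums
  X = (X & 0x3333333333333333ULL) +
      ((X >> 2) & 0x3333333333333333ULL);                // 4-bit sums
  X = (X + (X >> 4)) & 0x0F0F0F0F0F0F0F0FULL;            // per-byte sums
  return (X * 0x0101010101010101ULL) >> 56;              // horizontal add
}

Counting the shifts, masks, adds and the final multiply gives on the order of ten operations, which is what the new i64 cost of 10 models; the narrower types come out at 8/9/7 per the SLM throughput runs.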

lib/Target/X86/X86TargetTransformInfo.cpp

@@ -2103,8 +2103,17 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
    { ISD::FSQRT, MVT::f32,   28 }, // Pentium III from http://www.agner.org/
    { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
  };
  static const CostTblEntry POPCNT64CostTbl[] = { // 64-bit targets
    { ISD::CTPOP, MVT::i64, 1 },
  };
  static const CostTblEntry POPCNT32CostTbl[] = { // 32 or 64-bit targets
    { ISD::CTPOP, MVT::i32, 1 },
    { ISD::CTPOP, MVT::i16, 1 },
    { ISD::CTPOP, MVT::i8,  1 },
  };
  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
    { ISD::BITREVERSE, MVT::i64, 14 },
    { ISD::CTPOP,      MVT::i64, 10 },
    { ISD::SADDO,      MVT::i64,  1 },
    { ISD::UADDO,      MVT::i64,  1 },
  };
@@ -2112,6 +2121,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
    { ISD::BITREVERSE, MVT::i32, 14 },
    { ISD::BITREVERSE, MVT::i16, 14 },
    { ISD::BITREVERSE, MVT::i8,  11 },
    { ISD::CTPOP,      MVT::i32,  8 },
    { ISD::CTPOP,      MVT::i16,  9 },
    { ISD::CTPOP,      MVT::i8,   7 },
    { ISD::SADDO,      MVT::i32,  1 },
    { ISD::SADDO,      MVT::i16,  1 },
    { ISD::SADDO,      MVT::i8,   1 },
@@ -2223,6 +2235,17 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
  if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
    return LT.first * Entry->Cost;

  if (ST->hasPOPCNT()) {
    if (ST->is64Bit())
      if (const auto *Entry = CostTableLookup(POPCNT64CostTbl, ISD, MTy))
        return LT.first * Entry->Cost;

    if (const auto *Entry = CostTableLookup(POPCNT32CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;
  }

  // TODO - add LZCNT and BMI (TZCNT) scalar handling

  if (ST->is64Bit())
    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
      return LT.first * Entry->Cost;
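For reference, this lookup chain is what clients such as the vectorizers hit when they ask TargetTransformInfo for an intrinsic cost. A hedged sketch of such a query against the LLVM-9-era API (the exact getIntrinsicInstrCost overload and headers here are assumptions from memory, not taken from this patch):

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h" // FastMathFlags
#include "llvm/IR/Type.h"
using namespace llvm;

// Modeled cost of a scalar i64 ctpop for the current target: 1 with
// +popcnt (POPCNT64CostTbl above), 10 otherwise (X64CostTbl).
int ctpopCost(const TargetTransformInfo &TTI, LLVMContext &Ctx) {
  Type *I64 = Type::getInt64Ty(Ctx);
  return TTI.getIntrinsicInstrCost(Intrinsic::ctpop, I64, {I64},
                                   FastMathFlags());
}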

test/Analysis/CostModel/X86/ctpop.ll

@@ -16,7 +16,7 @@ declare i8 @llvm.ctpop.i8(i8)
define i64 @var_ctpop_i64(i64 %a) {
; NOPOPCNT-LABEL: 'var_ctpop_i64'
; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call i64 @llvm.ctpop.i64(i64 %a)
; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %ctpop = call i64 @llvm.ctpop.i64(i64 %a)
; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %ctpop
;
; POPCNT-LABEL: 'var_ctpop_i64'
@@ -29,7 +29,7 @@ define i64 @var_ctpop_i64(i64 %a) {
define i32 @var_ctpop_i32(i32 %a) {
; NOPOPCNT-LABEL: 'var_ctpop_i32'
; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call i32 @llvm.ctpop.i32(i32 %a)
; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %ctpop = call i32 @llvm.ctpop.i32(i32 %a)
; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %ctpop
;
; POPCNT-LABEL: 'var_ctpop_i32'
@@ -42,7 +42,7 @@ define i32 @var_ctpop_i32(i32 %a) {
define i16 @var_ctpop_i16(i16 %a) {
; NOPOPCNT-LABEL: 'var_ctpop_i16'
; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call i16 @llvm.ctpop.i16(i16 %a)
; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %ctpop = call i16 @llvm.ctpop.i16(i16 %a)
; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %ctpop
;
; POPCNT-LABEL: 'var_ctpop_i16'
@@ -55,7 +55,7 @@ define i16 @var_ctpop_i16(i16 %a) {
define i8 @var_ctpop_i8(i8 %a) {
; NOPOPCNT-LABEL: 'var_ctpop_i8'
; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %ctpop = call i8 @llvm.ctpop.i8(i8 %a)
; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %ctpop = call i8 @llvm.ctpop.i8(i8 %a)
; NOPOPCNT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %ctpop
;
; POPCNT-LABEL: 'var_ctpop_i8'

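These scalar overrides are exactly what flips the SLP vectorizer's decisions in the test below. Plain SSE2 has no POPCNT, so two scalar i64 ctpops now cost 2 x 10 = 20 and the v2i64 vector expansion becomes the cheaper option; SSE4.2 and AVX imply POPCNT in the X86 feature set, so each scalar op costs 1 and the calls stay scalar:

SSE2   (no POPCNT): 2 x i64 ctpop = 2 * 10 = 20 -> vector v2i64 expansion wins, SLP vectorizes
SSE4.2 (POPCNT):    2 x i64 ctpop = 2 * 1  =  2 -> scalar popcnt wins, SLP leaves the scalars alone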
test/Transforms/SLPVectorizer/X86/ctpop.ll

@@ -21,14 +21,29 @@ declare i16 @llvm.ctpop.i16(i16)
declare i8 @llvm.ctpop.i8(i8)
define void @ctpop_2i64() #0 {
; CHECK-LABEL: @ctpop_2i64(
; CHECK-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
; CHECK-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
; CHECK-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
; CHECK-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
; CHECK-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
; CHECK-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
; CHECK-NEXT: ret void
; SSE2-LABEL: @ctpop_2i64(
; SSE2-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8
; SSE2-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]])
; SSE2-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8
; SSE2-NEXT: ret void
;
; SSE42-LABEL: @ctpop_2i64(
; SSE42-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
; SSE42-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
; SSE42-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
; SSE42-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
; SSE42-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
; SSE42-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
; SSE42-NEXT: ret void
;
; AVX-LABEL: @ctpop_2i64(
; AVX-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
; AVX-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
; AVX-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
; AVX-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
; AVX-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
; AVX-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
; AVX-NEXT: ret void
;
%ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
%ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
@@ -40,20 +55,29 @@ define void @ctpop_2i64() #0 {
}
define void @ctpop_4i64() #0 {
; SSE-LABEL: @ctpop_4i64(
; SSE-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
; SSE-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
; SSE-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
; SSE-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
; SSE-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
; SSE-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
; SSE-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]])
; SSE-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]])
; SSE-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
; SSE-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
; SSE-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
; SSE-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
; SSE-NEXT: ret void
; SSE2-LABEL: @ctpop_4i64(
; SSE2-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4
; SSE2-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4
; SSE2-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP1]])
; SSE2-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> [[TMP2]])
; SSE2-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4
; SSE2-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4
; SSE2-NEXT: ret void
;
; SSE42-LABEL: @ctpop_4i64(
; SSE42-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
; SSE42-NEXT: [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
; SSE42-NEXT: [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
; SSE42-NEXT: [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
; SSE42-NEXT: [[CTPOP0:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD0]])
; SSE42-NEXT: [[CTPOP1:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD1]])
; SSE42-NEXT: [[CTPOP2:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD2]])
; SSE42-NEXT: [[CTPOP3:%.*]] = call i64 @llvm.ctpop.i64(i64 [[LD3]])
; SSE42-NEXT: store i64 [[CTPOP0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
; SSE42-NEXT: store i64 [[CTPOP1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
; SSE42-NEXT: store i64 [[CTPOP2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
; SSE42-NEXT: store i64 [[CTPOP3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
; SSE42-NEXT: ret void
;
; AVX1-LABEL: @ctpop_4i64(
; AVX1-NEXT: [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4