diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 11bce7c46f5..e14220807c8 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -140,6 +140,30 @@ int X86TTIImpl::getArithmeticInstrCost( return Cost; } + static const CostTblEntry AVX512BWUniformConstCostTable[] = { + { ISD::SDIV, MVT::v32i16, 6 }, // vpmulhw sequence + { ISD::UDIV, MVT::v32i16, 6 }, // vpmulhuw sequence + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && + ST->hasBWI()) { + if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; + } + + static const CostTblEntry AVX512UniformConstCostTable[] = { + { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence + { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && + ST->hasAVX512()) { + if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; + } + static const CostTblEntry AVX2UniformConstCostTable[] = { { ISD::SRA, MVT::v4i64, 4 }, // 2 x psrad + shuffle. @@ -156,6 +180,30 @@ int X86TTIImpl::getArithmeticInstrCost( return LT.first * Entry->Cost; } + static const CostTblEntry SSE2UniformConstCostTable[] = { + { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence + { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence + { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence + { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence + { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence + { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence + { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence + { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence + }; + + if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && + ST->hasSSE2()) { + // pmuldq sequence. + if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX()) + return LT.first * 30; + if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) + return LT.first * 15; + + if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, + LT.second)) + return LT.first * Entry->Cost; + } + static const CostTblEntry AVX512BWCostTable[] = { // Vectorizing division is a bad idea. See the SSE2 table for more comments. { ISD::SDIV, MVT::v64i8, 64*20 }, @@ -291,15 +339,6 @@ int X86TTIImpl::getArithmeticInstrCost( return LT.first * Entry->Cost; } - static const CostTblEntry - SSE2UniformConstCostTable[] = { - // Constant splats are cheaper for the following instructions. - { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence - { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence - { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence - { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence - }; - static const CostTblEntry SSE2UniformCostTable[] = { // Uniform splats are cheaper for the following instructions. @@ -334,14 +373,6 @@ int X86TTIImpl::getArithmeticInstrCost( if (ST->hasSSE2() && ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || (Op2Info == TargetTransformInfo::OK_UniformValue))) { - if (Op2Info == TargetTransformInfo::OK_UniformConstantValue) { - // pmuldq sequence. - if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) - return LT.first * 15; - if (const auto *Entry = - CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) - return LT.first * Entry->Cost; - } if (const auto *Entry = CostTableLookup(SSE2UniformCostTable, ISD, LT.second)) return LT.first * Entry->Cost; diff --git a/test/Analysis/CostModel/X86/div.ll b/test/Analysis/CostModel/X86/div.ll index 620420c24b2..0ac06ff75eb 100644 --- a/test/Analysis/CostModel/X86/div.ll +++ b/test/Analysis/CostModel/X86/div.ll @@ -113,8 +113,8 @@ define i32 @udiv() { %V64i8 = udiv <64 x i8> undef, undef ret i32 undef -} - +} + ; CHECK-LABEL: 'sdiv_uniformconst' define i32 @sdiv_uniformconst() { ; CHECK: cost of 1 {{.*}} %I64 = sdiv @@ -139,17 +139,16 @@ define i32 @sdiv_uniformconst() { ; SSE2: cost of 38 {{.*}} %V8i32 = sdiv ; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv ; SSE42: cost of 30 {{.*}} %V8i32 = sdiv - ; AVX1: cost of 160 {{.*}} %V8i32 = sdiv + ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv ; AVX2: cost of 15 {{.*}} %V8i32 = sdiv ; AVX512: cost of 15 {{.*}} %V8i32 = sdiv %V8i32 = sdiv <8 x i32> undef, ; SSE2: cost of 76 {{.*}} %V16i32 = sdiv ; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv ; SSE42: cost of 60 {{.*}} %V16i32 = sdiv - ; AVX1: cost of 320 {{.*}} %V16i32 = sdiv + ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv ; AVX2: cost of 30 {{.*}} %V16i32 = sdiv - ; AVX512F: cost of 48 {{.*}} %V16i32 = sdiv - ; AVX512BW: cost of 320 {{.*}} %V16i32 = sdiv + ; AVX512: cost of 15 {{.*}} %V16i32 = sdiv %V16i32 = sdiv <16 x i32> undef, ; CHECK: cost of 1 {{.*}} %I16 = sdiv @@ -158,15 +157,15 @@ define i32 @sdiv_uniformconst() { ; AVX: cost of 6 {{.*}} %V8i16 = sdiv %V8i16 = sdiv <8 x i16> undef, ; SSE: cost of 12 {{.*}} %V16i16 = sdiv - ; AVX1: cost of 320 {{.*}} %V16i16 = sdiv + ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv ; AVX2: cost of 6 {{.*}} %V16i16 = sdiv ; AVX512: cost of 6 {{.*}} %V16i16 = sdiv %V16i16 = sdiv <16 x i16> undef, ; SSE: cost of 24 {{.*}} %V32i16 = sdiv - ; AVX1: cost of 640 {{.*}} %V32i16 = sdiv + ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv ; AVX2: cost of 12 {{.*}} %V32i16 = sdiv ; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv - ; AVX512BW: cost of 640 {{.*}} %V32i16 = sdiv + ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv %V32i16 = sdiv <32 x i16> undef, ; CHECK: cost of 1 {{.*}} %I8 = sdiv @@ -182,8 +181,8 @@ define i32 @sdiv_uniformconst() { %V64i8 = sdiv <64 x i8> undef, ret i32 undef -} - +} + ; CHECK-LABEL: 'udiv_uniformconst' define i32 @udiv_uniformconst() { ; CHECK: cost of 1 {{.*}} %I64 = udiv @@ -204,15 +203,14 @@ define i32 @udiv_uniformconst() { ; AVX: cost of 15 {{.*}} %V4i32 = udiv %V4i32 = udiv <4 x i32> undef, ; SSE: cost of 30 {{.*}} %V8i32 = udiv - ; AVX1: cost of 160 {{.*}} %V8i32 = udiv + ; AVX1: cost of 30 {{.*}} %V8i32 = udiv ; AVX2: cost of 15 {{.*}} %V8i32 = udiv ; AVX512: cost of 15 {{.*}} %V8i32 = udiv %V8i32 = udiv <8 x i32> undef, ; SSE: cost of 60 {{.*}} %V16i32 = udiv - ; AVX1: cost of 320 {{.*}} %V16i32 = udiv + ; AVX1: cost of 60 {{.*}} %V16i32 = udiv ; AVX2: cost of 30 {{.*}} %V16i32 = udiv - ; AVX512F: cost of 48 {{.*}} %V16i32 = udiv - ; AVX512BW: cost of 320 {{.*}} %V16i32 = udiv + ; AVX512: cost of 15 {{.*}} %V16i32 = udiv %V16i32 = udiv <16 x i32> undef, ; CHECK: cost of 1 {{.*}} %I16 = udiv @@ -221,15 +219,15 @@ define i32 @udiv_uniformconst() { ; AVX: cost of 6 {{.*}} %V8i16 = udiv %V8i16 = udiv <8 x i16> undef, ; SSE: cost of 12 {{.*}} %V16i16 = udiv - ; AVX1: cost of 320 {{.*}} %V16i16 = udiv + ; AVX1: cost of 12 {{.*}} %V16i16 = udiv ; AVX2: cost of 6 {{.*}} %V16i16 = udiv ; AVX512: cost of 6 {{.*}} %V16i16 = udiv %V16i16 = udiv <16 x i16> undef, ; SSE: cost of 24 {{.*}} %V32i16 = udiv - ; AVX1: cost of 640 {{.*}} %V32i16 = udiv + ; AVX1: cost of 24 {{.*}} %V32i16 = udiv ; AVX2: cost of 12 {{.*}} %V32i16 = udiv ; AVX512F: cost of 12 {{.*}} %V32i16 = udiv - ; AVX512BW: cost of 640 {{.*}} %V32i16 = udiv + ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv %V32i16 = udiv <32 x i16> undef, ; CHECK: cost of 1 {{.*}} %I8 = udiv @@ -245,8 +243,8 @@ define i32 @udiv_uniformconst() { %V64i8 = udiv <64 x i8> undef, ret i32 undef -} - +} + ; CHECK-LABEL: 'sdiv_uniformconstpow2' define i32 @sdiv_uniformconstpow2() { ; CHECK: cost of 1 {{.*}} %I64 = sdiv @@ -271,17 +269,16 @@ define i32 @sdiv_uniformconstpow2() { ; SSE2: cost of 38 {{.*}} %V8i32 = sdiv ; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv ; SSE42: cost of 30 {{.*}} %V8i32 = sdiv - ; AVX1: cost of 160 {{.*}} %V8i32 = sdiv + ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv ; AVX2: cost of 15 {{.*}} %V8i32 = sdiv ; AVX512: cost of 15 {{.*}} %V8i32 = sdiv %V8i32 = sdiv <8 x i32> undef, ; SSE2: cost of 76 {{.*}} %V16i32 = sdiv ; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv ; SSE42: cost of 60 {{.*}} %V16i32 = sdiv - ; AVX1: cost of 320 {{.*}} %V16i32 = sdiv + ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv ; AVX2: cost of 30 {{.*}} %V16i32 = sdiv - ; AVX512F: cost of 48 {{.*}} %V16i32 = sdiv - ; AVX512BW: cost of 320 {{.*}} %V16i32 = sdiv + ; AVX512: cost of 15 {{.*}} %V16i32 = sdiv %V16i32 = sdiv <16 x i32> undef, ; CHECK: cost of 1 {{.*}} %I16 = sdiv @@ -290,15 +287,15 @@ define i32 @sdiv_uniformconstpow2() { ; AVX: cost of 6 {{.*}} %V8i16 = sdiv %V8i16 = sdiv <8 x i16> undef, ; SSE: cost of 12 {{.*}} %V16i16 = sdiv - ; AVX1: cost of 320 {{.*}} %V16i16 = sdiv + ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv ; AVX2: cost of 6 {{.*}} %V16i16 = sdiv ; AVX512: cost of 6 {{.*}} %V16i16 = sdiv %V16i16 = sdiv <16 x i16> undef, ; SSE: cost of 24 {{.*}} %V32i16 = sdiv - ; AVX1: cost of 640 {{.*}} %V32i16 = sdiv + ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv ; AVX2: cost of 12 {{.*}} %V32i16 = sdiv ; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv - ; AVX512BW: cost of 640 {{.*}} %V32i16 = sdiv + ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv %V32i16 = sdiv <32 x i16> undef, ; CHECK: cost of 1 {{.*}} %I8 = sdiv @@ -315,7 +312,7 @@ define i32 @sdiv_uniformconstpow2() { ret i32 undef } - + ; CHECK-LABEL: 'udiv_uniformconstpow2' define i32 @udiv_uniformconstpow2() { ; CHECK: cost of 1 {{.*}} %I64 = udiv @@ -336,15 +333,14 @@ define i32 @udiv_uniformconstpow2() { ; AVX: cost of 15 {{.*}} %V4i32 = udiv %V4i32 = udiv <4 x i32> undef, ; SSE: cost of 30 {{.*}} %V8i32 = udiv - ; AVX1: cost of 160 {{.*}} %V8i32 = udiv + ; AVX1: cost of 30 {{.*}} %V8i32 = udiv ; AVX2: cost of 15 {{.*}} %V8i32 = udiv ; AVX512: cost of 15 {{.*}} %V8i32 = udiv %V8i32 = udiv <8 x i32> undef, ; SSE: cost of 60 {{.*}} %V16i32 = udiv - ; AVX1: cost of 320 {{.*}} %V16i32 = udiv + ; AVX1: cost of 60 {{.*}} %V16i32 = udiv ; AVX2: cost of 30 {{.*}} %V16i32 = udiv - ; AVX512F: cost of 48 {{.*}} %V16i32 = udiv - ; AVX512BW: cost of 320 {{.*}} %V16i32 = udiv + ; AVX512: cost of 15 {{.*}} %V16i32 = udiv %V16i32 = udiv <16 x i32> undef, ; CHECK: cost of 1 {{.*}} %I16 = udiv @@ -353,15 +349,15 @@ define i32 @udiv_uniformconstpow2() { ; AVX: cost of 6 {{.*}} %V8i16 = udiv %V8i16 = udiv <8 x i16> undef, ; SSE: cost of 12 {{.*}} %V16i16 = udiv - ; AVX1: cost of 320 {{.*}} %V16i16 = udiv + ; AVX1: cost of 12 {{.*}} %V16i16 = udiv ; AVX2: cost of 6 {{.*}} %V16i16 = udiv ; AVX512: cost of 6 {{.*}} %V16i16 = udiv %V16i16 = udiv <16 x i16> undef, ; SSE: cost of 24 {{.*}} %V32i16 = udiv - ; AVX1: cost of 640 {{.*}} %V32i16 = udiv + ; AVX1: cost of 24 {{.*}} %V32i16 = udiv ; AVX2: cost of 12 {{.*}} %V32i16 = udiv ; AVX512F: cost of 12 {{.*}} %V32i16 = udiv - ; AVX512BW: cost of 640 {{.*}} %V32i16 = udiv + ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv %V32i16 = udiv <32 x i16> undef, ; CHECK: cost of 1 {{.*}} %I8 = udiv @@ -377,4 +373,4 @@ define i32 @udiv_uniformconstpow2() { %V64i8 = udiv <64 x i8> undef, ret i32 undef -} +}