[CostModel][X86] Fixed AVX1/AVX512 sdiv/udiv uniform-constant costs for 256/512-bit integer vectors

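We weren't checking the uniform-constant cost tables before the general cost tables, so sdiv/udiv by a uniform constant was matched against the general vector division costs, resulting in very high estimates. llvm-svn: 284755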
2024-11-24 03:33:20 +01:00 · 2016-10-20 18:00:35 +00:00 · 2016-10-20 18:00:35 +00:00 · 6773ac2510
commit 6773ac2510
parent 4cd983fcee
2 changed files with 80 additions and 53 deletions
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@ -140,6 +140,30 @@ int X86TTIImpl::getArithmeticInstrCost(
    return Cost;
  }

+  static const CostTblEntry AVX512BWUniformConstCostTable[] = {
+    { ISD::SDIV, MVT::v32i16,  6 }, // vpmulhw sequence
+    { ISD::UDIV, MVT::v32i16,  6 }, // vpmulhuw sequence
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasBWI()) {
+    if (const auto *Entry = CostTableLookup(AVX512BWUniformConstCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  }
+
+  static const CostTblEntry AVX512UniformConstCostTable[] = {
+    { ISD::SDIV, MVT::v16i32, 15 }, // vpmuldq sequence
+    { ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasAVX512()) {
+    if (const auto *Entry = CostTableLookup(AVX512UniformConstCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  }
+
  static const CostTblEntry AVX2UniformConstCostTable[] = {
    { ISD::SRA,  MVT::v4i64,   4 }, // 2 x psrad + shuffle.

@ -156,6 +180,30 @@ int X86TTIImpl::getArithmeticInstrCost(
      return LT.first * Entry->Cost;
  }

+  static const CostTblEntry SSE2UniformConstCostTable[] = {
+    { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence
+    { ISD::SDIV, MVT::v8i16,   6 }, // pmulhw sequence
+    { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence
+    { ISD::UDIV, MVT::v8i16,   6 }, // pmulhuw sequence
+    { ISD::SDIV, MVT::v8i32,  38 }, // pmuludq sequence
+    { ISD::SDIV, MVT::v4i32,  19 }, // pmuludq sequence
+    { ISD::UDIV, MVT::v8i32,  30 }, // pmuludq sequence
+    { ISD::UDIV, MVT::v4i32,  15 }, // pmuludq sequence
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasSSE2()) {
+    // pmuldq sequence.
+    if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX())
+      return LT.first * 30;
+    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+      return LT.first * 15;
+
+    if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD,
+                                            LT.second))
+      return LT.first * Entry->Cost;
+  }
+
  static const CostTblEntry AVX512BWCostTable[] = {
    // Vectorizing division is a bad idea. See the SSE2 table for more comments.
    { ISD::SDIV,  MVT::v64i8,  64*20 },
@ -291,15 +339,6 @@ int X86TTIImpl::getArithmeticInstrCost(
      return LT.first * Entry->Cost;
  }

-  static const CostTblEntry
-  SSE2UniformConstCostTable[] = {
-    // Constant splats are cheaper for the following instructions.
-    { ISD::SDIV, MVT::v8i16,  6 }, // pmulhw sequence
-    { ISD::UDIV, MVT::v8i16,  6 }, // pmulhuw sequence
-    { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
-    { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
-  };
-
  static const CostTblEntry
  SSE2UniformCostTable[] = {
    // Uniform splats are cheaper for the following instructions.
@ -334,14 +373,6 @@ int X86TTIImpl::getArithmeticInstrCost(
  if (ST->hasSSE2() &&
      ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) ||
       (Op2Info == TargetTransformInfo::OK_UniformValue))) {
-    if (Op2Info == TargetTransformInfo::OK_UniformConstantValue) {
-      // pmuldq sequence.
-      if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
-        return LT.first * 15;
-      if (const auto *Entry =
-              CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second))
-        return LT.first * Entry->Cost;
-    }
    if (const auto *Entry =
            CostTableLookup(SSE2UniformCostTable, ISD, LT.second))
      return LT.first * Entry->Cost;
--- a/test/Analysis/CostModel/X86/div.ll
+++ b/test/Analysis/CostModel/X86/div.ll
@ -113,8 +113,8 @@ define i32 @udiv() {
  %V64i8 = udiv <64 x i8> undef, undef

  ret i32 undef
-}
-
+}
+
 ; CHECK-LABEL: 'sdiv_uniformconst'
 define i32 @sdiv_uniformconst() {
  ; CHECK: cost of 1 {{.*}} %I64 = sdiv
@ -139,17 +139,16 @@ define i32 @sdiv_uniformconst() {
  ; SSE2: cost of 38 {{.*}} %V8i32 = sdiv
  ; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv
  ; SSE42: cost of 30 {{.*}} %V8i32 = sdiv
-  ; AVX1: cost of 160 {{.*}} %V8i32 = sdiv
+  ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv
  ; AVX2: cost of 15 {{.*}} %V8i32 = sdiv
  ; AVX512: cost of 15 {{.*}} %V8i32 = sdiv
  %V8i32 = sdiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ; SSE2: cost of 76 {{.*}} %V16i32 = sdiv
  ; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv
  ; SSE42: cost of 60 {{.*}} %V16i32 = sdiv
-  ; AVX1: cost of 320 {{.*}} %V16i32 = sdiv
+  ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv
  ; AVX2: cost of 30 {{.*}} %V16i32 = sdiv
-  ; AVX512F: cost of 48 {{.*}} %V16i32 = sdiv
-  ; AVX512BW: cost of 320 {{.*}} %V16i32 = sdiv
+  ; AVX512: cost of 15 {{.*}} %V16i32 = sdiv
  %V16i32 = sdiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>

  ; CHECK: cost of 1 {{.*}} %I16 = sdiv
@ -158,15 +157,15 @@ define i32 @sdiv_uniformconst() {
  ; AVX: cost of 6 {{.*}} %V8i16 = sdiv
  %V8i16 = sdiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ; SSE: cost of 12 {{.*}} %V16i16 = sdiv
-  ; AVX1: cost of 320 {{.*}} %V16i16 = sdiv
+  ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv
  ; AVX2: cost of 6 {{.*}} %V16i16 = sdiv
  ; AVX512: cost of 6 {{.*}} %V16i16 = sdiv
  %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ; SSE: cost of 24 {{.*}} %V32i16 = sdiv
-  ; AVX1: cost of 640 {{.*}} %V32i16 = sdiv
+  ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv
  ; AVX2: cost of 12 {{.*}} %V32i16 = sdiv
  ; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv
-  ; AVX512BW: cost of 640 {{.*}} %V32i16 = sdiv
+  ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv
  %V32i16 = sdiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>

  ; CHECK: cost of 1 {{.*}} %I8 = sdiv
@ -182,8 +181,8 @@ define i32 @sdiv_uniformconst() {
  %V64i8 = sdiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>

  ret i32 undef
-}
-
+}
+
 ; CHECK-LABEL: 'udiv_uniformconst'
 define i32 @udiv_uniformconst() {
  ; CHECK: cost of 1 {{.*}} %I64 = udiv
@ -204,15 +203,14 @@ define i32 @udiv_uniformconst() {
  ; AVX: cost of 15 {{.*}} %V4i32 = udiv
  %V4i32 = udiv <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
  ; SSE: cost of 30 {{.*}} %V8i32 = udiv
-  ; AVX1: cost of 160 {{.*}} %V8i32 = udiv
+  ; AVX1: cost of 30 {{.*}} %V8i32 = udiv
  ; AVX2: cost of 15 {{.*}} %V8i32 = udiv
  ; AVX512: cost of 15 {{.*}} %V8i32 = udiv
  %V8i32 = udiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
  ; SSE: cost of 60 {{.*}} %V16i32 = udiv
-  ; AVX1: cost of 320 {{.*}} %V16i32 = udiv
+  ; AVX1: cost of 60 {{.*}} %V16i32 = udiv
  ; AVX2: cost of 30 {{.*}} %V16i32 = udiv
-  ; AVX512F: cost of 48 {{.*}} %V16i32 = udiv
-  ; AVX512BW: cost of 320 {{.*}} %V16i32 = udiv
+  ; AVX512: cost of 15 {{.*}} %V16i32 = udiv
  %V16i32 = udiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>

  ; CHECK: cost of 1 {{.*}} %I16 = udiv
@ -221,15 +219,15 @@ define i32 @udiv_uniformconst() {
  ; AVX: cost of 6 {{.*}} %V8i16 = udiv
  %V8i16 = udiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ; SSE: cost of 12 {{.*}} %V16i16 = udiv
-  ; AVX1: cost of 320 {{.*}} %V16i16 = udiv
+  ; AVX1: cost of 12 {{.*}} %V16i16 = udiv
  ; AVX2: cost of 6 {{.*}} %V16i16 = udiv
  ; AVX512: cost of 6 {{.*}} %V16i16 = udiv
  %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  ; SSE: cost of 24 {{.*}} %V32i16 = udiv
-  ; AVX1: cost of 640 {{.*}} %V32i16 = udiv
+  ; AVX1: cost of 24 {{.*}} %V32i16 = udiv
  ; AVX2: cost of 12 {{.*}} %V32i16 = udiv
  ; AVX512F: cost of 12 {{.*}} %V32i16 = udiv
-  ; AVX512BW: cost of 640 {{.*}} %V32i16 = udiv
+  ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv
  %V32i16 = udiv <32 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>

  ; CHECK: cost of 1 {{.*}} %I8 = udiv
@ -245,8 +243,8 @@ define i32 @udiv_uniformconst() {
  %V64i8 = udiv <64 x i8> undef, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>

  ret i32 undef
-}
-
+}
+
 ; CHECK-LABEL: 'sdiv_uniformconstpow2'
 define i32 @sdiv_uniformconstpow2() {
  ; CHECK: cost of 1 {{.*}} %I64 = sdiv
@ -271,17 +269,16 @@ define i32 @sdiv_uniformconstpow2() {
  ; SSE2: cost of 38 {{.*}} %V8i32 = sdiv
  ; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv
  ; SSE42: cost of 30 {{.*}} %V8i32 = sdiv
-  ; AVX1: cost of 160 {{.*}} %V8i32 = sdiv
+  ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv
  ; AVX2: cost of 15 {{.*}} %V8i32 = sdiv
  ; AVX512: cost of 15 {{.*}} %V8i32 = sdiv
  %V8i32 = sdiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ; SSE2: cost of 76 {{.*}} %V16i32 = sdiv
  ; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv
  ; SSE42: cost of 60 {{.*}} %V16i32 = sdiv
-  ; AVX1: cost of 320 {{.*}} %V16i32 = sdiv
+  ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv
  ; AVX2: cost of 30 {{.*}} %V16i32 = sdiv
-  ; AVX512F: cost of 48 {{.*}} %V16i32 = sdiv
-  ; AVX512BW: cost of 320 {{.*}} %V16i32 = sdiv
+  ; AVX512: cost of 15 {{.*}} %V16i32 = sdiv
  %V16i32 = sdiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>

  ; CHECK: cost of 1 {{.*}} %I16 = sdiv
@ -290,15 +287,15 @@ define i32 @sdiv_uniformconstpow2() {
  ; AVX: cost of 6 {{.*}} %V8i16 = sdiv
  %V8i16 = sdiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
  ; SSE: cost of 12 {{.*}} %V16i16 = sdiv
-  ; AVX1: cost of 320 {{.*}} %V16i16 = sdiv
+  ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv
  ; AVX2: cost of 6 {{.*}} %V16i16 = sdiv
  ; AVX512: cost of 6 {{.*}} %V16i16 = sdiv
  %V16i16 = sdiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
  ; SSE: cost of 24 {{.*}} %V32i16 = sdiv
-  ; AVX1: cost of 640 {{.*}} %V32i16 = sdiv
+  ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv
  ; AVX2: cost of 12 {{.*}} %V32i16 = sdiv
  ; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv
-  ; AVX512BW: cost of 640 {{.*}} %V32i16 = sdiv
+  ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv
  %V32i16 = sdiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>

  ; CHECK: cost of 1 {{.*}} %I8 = sdiv
@ -315,7 +312,7 @@ define i32 @sdiv_uniformconstpow2() {

  ret i32 undef
 }
-
+
 ; CHECK-LABEL: 'udiv_uniformconstpow2'
 define i32 @udiv_uniformconstpow2() {
  ; CHECK: cost of 1 {{.*}} %I64 = udiv
@ -336,15 +333,14 @@ define i32 @udiv_uniformconstpow2() {
  ; AVX: cost of 15 {{.*}} %V4i32 = udiv
  %V4i32 = udiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
  ; SSE: cost of 30 {{.*}} %V8i32 = udiv
-  ; AVX1: cost of 160 {{.*}} %V8i32 = udiv
+  ; AVX1: cost of 30 {{.*}} %V8i32 = udiv
  ; AVX2: cost of 15 {{.*}} %V8i32 = udiv
  ; AVX512: cost of 15 {{.*}} %V8i32 = udiv
  %V8i32 = udiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
  ; SSE: cost of 60 {{.*}} %V16i32 = udiv
-  ; AVX1: cost of 320 {{.*}} %V16i32 = udiv
+  ; AVX1: cost of 60 {{.*}} %V16i32 = udiv
  ; AVX2: cost of 30 {{.*}} %V16i32 = udiv
-  ; AVX512F: cost of 48 {{.*}} %V16i32 = udiv
-  ; AVX512BW: cost of 320 {{.*}} %V16i32 = udiv
+  ; AVX512: cost of 15 {{.*}} %V16i32 = udiv
  %V16i32 = udiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>

  ; CHECK: cost of 1 {{.*}} %I16 = udiv
@ -353,15 +349,15 @@ define i32 @udiv_uniformconstpow2() {
  ; AVX: cost of 6 {{.*}} %V8i16 = udiv
  %V8i16 = udiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
  ; SSE: cost of 12 {{.*}} %V16i16 = udiv
-  ; AVX1: cost of 320 {{.*}} %V16i16 = udiv
+  ; AVX1: cost of 12 {{.*}} %V16i16 = udiv
  ; AVX2: cost of 6 {{.*}} %V16i16 = udiv
  ; AVX512: cost of 6 {{.*}} %V16i16 = udiv
  %V16i16 = udiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
  ; SSE: cost of 24 {{.*}} %V32i16 = udiv
-  ; AVX1: cost of 640 {{.*}} %V32i16 = udiv
+  ; AVX1: cost of 24 {{.*}} %V32i16 = udiv
  ; AVX2: cost of 12 {{.*}} %V32i16 = udiv
  ; AVX512F: cost of 12 {{.*}} %V32i16 = udiv
-  ; AVX512BW: cost of 640 {{.*}} %V32i16 = udiv
+  ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv
  %V32i16 = udiv <32 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>

  ; CHECK: cost of 1 {{.*}} %I8 = udiv
@ -377,4 +373,4 @@ define i32 @udiv_uniformconstpow2() {
  %V64i8 = udiv <64 x i8> undef, <i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16, i8 16>

  ret i32 undef
-}
+}