[CostModel][X86] Fix SLM <2 x i64> icmp costs

SLM is 2 x slower for <2 x i64> comparison ops than other vector types, we should account for this like we do for SLM <2 x i64> add/sub/mul costs. This should remove some of the SLM codegen diffs in D43582 llvm-svn: 372954
2024-11-23 19:23:23 +01:00 · 2019-09-26 10:14:38 +00:00 · 2019-09-26 10:14:38 +00:00 · de70987193
commit de70987193
parent 0ad10a49c7
9 changed files with 358 additions and 87 deletions
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@ -1797,6 +1797,11 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
    }
  }

+  static const CostTblEntry SLMCostTbl[] = {
+    // slm pcmpeq/pcmpgt throughput is 2
+    { ISD::SETCC,   MVT::v2i64,   2 },
+  };
+
  static const CostTblEntry AVX512BWCostTbl[] = {
    { ISD::SETCC,   MVT::v32i16,  1 },
    { ISD::SETCC,   MVT::v64i8,   1 },
@ -1883,6 +1888,10 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
    { ISD::SELECT,  MVT::v4f32,   3 }, // andps + andnps + orps
  };

+  if (ST->isSLM())
+    if (const auto *Entry = CostTableLookup(SLMCostTbl, ISD, MTy))
+      return LT.first * (ExtraCost + Entry->Cost);
+
  if (ST->hasBWI())
    if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
      return LT.first * (ExtraCost + Entry->Cost);
--- a/test/Analysis/CostModel/X86/arith-overflow.ll
+++ b/test/Analysis/CostModel/X86/arith-overflow.ll
@ -171,9 +171,9 @@ define i32 @sadd(i32 %arg) {
 ;
 ; SLM-LABEL: 'sadd'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.sadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.sadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.sadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.sadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.sadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
@ -409,9 +409,9 @@ define i32 @uadd(i32 %arg) {
 ;
 ; SLM-LABEL: 'uadd'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.uadd.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.uadd.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.uadd.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.uadd.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.uadd.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
@ -647,9 +647,9 @@ define i32 @ssub(i32 %arg) {
 ;
 ; SLM-LABEL: 'ssub'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.ssub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.ssub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.ssub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.ssub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.ssub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
@ -885,9 +885,9 @@ define i32 @usub(i32 %arg) {
 ;
 ; SLM-LABEL: 'usub'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.usub.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.usub.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.usub.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.usub.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.usub.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
@ -1123,9 +1123,9 @@ define i32 @smul(i32 %arg) {
 ;
 ; SLM-LABEL: 'smul'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %I64 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 100 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.smul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 50 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.smul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 104 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.smul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call { i32, i1 } @llvm.smul.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.smul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 98 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.smul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
@ -1361,9 +1361,9 @@ define i32 @umul(i32 %arg) {
 ;
 ; SLM-LABEL: 'umul'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %I64 = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 84 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call { <2 x i64>, <2 x i1> } @llvm.umul.with.overflow.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 42 for instruction: %V4I64 = call { <4 x i64>, <4 x i1> } @llvm.umul.with.overflow.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %V8I64 = call { <8 x i64>, <8 x i1> } @llvm.umul.with.overflow.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 46 for instruction: %V4I32 = call { <4 x i32>, <4 x i1> } @llvm.umul.with.overflow.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %V8I32 = call { <8 x i32>, <8 x i1> } @llvm.umul.with.overflow.v8i32(<8 x i32> undef, <8 x i32> undef)
--- a/test/Analysis/CostModel/X86/arith-ssat.ll
+++ b/test/Analysis/CostModel/X86/arith-ssat.ll
@ -170,9 +170,9 @@ define i32 @add(i32 %arg) {
 ;
 ; SLM-LABEL: 'add'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.sadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = call <4 x i64> @llvm.sadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8I64 = call <8 x i64> @llvm.sadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.sadd.sat.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.sadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.sadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
@ -404,9 +404,9 @@ define i32 @sub(i32 %arg) {
 ;
 ; SLM-LABEL: 'sub'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.ssub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 13 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 19 for instruction: %V2I64 = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V4I64 = call <4 x i64> @llvm.ssub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 59 for instruction: %V8I64 = call <8 x i64> @llvm.ssub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.ssub.sat.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I32 = call <4 x i32> @llvm.ssub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V8I32 = call <8 x i32> @llvm.ssub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
--- a/test/Analysis/CostModel/X86/arith-usat.ll
+++ b/test/Analysis/CostModel/X86/arith-usat.ll
@ -170,9 +170,9 @@ define i32 @add(i32 %arg) {
 ;
 ; SLM-LABEL: 'add'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.uadd.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.uadd.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.uadd.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.uadd.sat.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = call <4 x i32> @llvm.uadd.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = call <8 x i32> @llvm.uadd.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
@ -404,9 +404,9 @@ define i32 @sub(i32 %arg) {
 ;
 ; SLM-LABEL: 'sub'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I64 = call i64 @llvm.usub.sat.i64(i64 undef, i64 undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2I64 = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> undef, <2 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 14 for instruction: %V4I64 = call <4 x i64> @llvm.usub.sat.v4i64(<4 x i64> undef, <4 x i64> undef)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8I64 = call <8 x i64> @llvm.usub.sat.v8i64(<8 x i64> undef, <8 x i64> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %I32 = call i32 @llvm.usub.sat.i32(i32 undef, i32 undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> undef, <4 x i32> undef)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <8 x i32> @llvm.usub.sat.v8i32(<8 x i32> undef, <8 x i32> undef)
--- a/test/Analysis/CostModel/X86/fshl.ll
+++ b/test/Analysis/CostModel/X86/fshl.ll
@ -57,9 +57,9 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
 ;
 ; SLM-LABEL: 'var_funnel_i64'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 %c64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; GLM-LABEL: 'var_funnel_i64'
@ -385,9 +385,9 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; GLM-LABEL: 'splatvar_funnel_i64'
@ -777,9 +777,9 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
 ;
 ; SLM-LABEL: 'constant_funnel_i64'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; GLM-LABEL: 'constant_funnel_i64'
@ -1093,9 +1093,9 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
 ;
 ; SLM-LABEL: 'splatconstant_funnel_i64'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshl.i64(i64 %a64, i64 %b64, i64 7)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshl.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshl.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; GLM-LABEL: 'splatconstant_funnel_i64'
--- a/test/Analysis/CostModel/X86/fshr.ll
+++ b/test/Analysis/CostModel/X86/fshr.ll
@ -57,9 +57,9 @@ define void @var_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64
 ;
 ; SLM-LABEL: 'var_funnel_i64'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 %c64)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 17 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %c128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %c256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 68 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %c512)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; GLM-LABEL: 'var_funnel_i64'
@ -385,9 +385,9 @@ define void @splatvar_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u128 = shufflevector <2 x i64> %c128, <2 x i64> undef, <2 x i32> zeroinitializer
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u256 = shufflevector <4 x i64> %c256, <4 x i64> undef, <4 x i32> zeroinitializer
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %u512 = shufflevector <8 x i64> %c512, <8 x i64> undef, <8 x i32> zeroinitializer
-; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> %u128)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> %u256)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> %u512)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; GLM-LABEL: 'splatvar_funnel_i64'
@ -777,9 +777,9 @@ define void @constant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
 ;
 ; SLM-LABEL: 'constant_funnel_i64'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 30 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 60 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 1, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 1, i64 7, i64 15, i64 31>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 1, i64 7, i64 15, i64 31, i64 1, i64 7, i64 15, i64 31>)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; GLM-LABEL: 'constant_funnel_i64'
@ -1093,9 +1093,9 @@ define void @splatconstant_funnel_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256
 ;
 ; SLM-LABEL: 'splatconstant_funnel_i64'
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.fshr.i64(i64 %a64, i64 %b64, i64 7)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
-; SLM-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V2I64 = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a128, <2 x i64> %b128, <2 x i64> <i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V4I64 = call <4 x i64> @llvm.fshr.v4i64(<4 x i64> %a256, <4 x i64> %b256, <4 x i64> <i64 7, i64 7, i64 7, i64 7>)
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V8I64 = call <8 x i64> @llvm.fshr.v8i64(<8 x i64> %a512, <8 x i64> %b512, <8 x i64> <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>)
 ; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; GLM-LABEL: 'splatconstant_funnel_i64'
--- a/test/Analysis/CostModel/X86/icmp.ll
+++ b/test/Analysis/CostModel/X86/icmp.ll
@ -11,7 +11,7 @@
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mattr=+xop,+avx | FileCheck %s -check-prefixes=CHECK,AVX,XOPAVX1
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mattr=+xop,+avx2 | FileCheck %s -check-prefixes=CHECK,AVX,XOPAVX2
 ;
-; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
+; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=slm | FileCheck %s --check-prefixes=CHECK,SSE,SLM
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=goldmont | FileCheck %s --check-prefixes=CHECK,SSE,SSE42
 ; RUN: opt < %s -mtriple=x86_64-apple-macosx10.8.0 -cost-model -analyze -mcpu=btver2 | FileCheck %s --check-prefixes=BTVER2

@ -269,6 +269,29 @@ define i32 @cmp_int_eq(i32 %arg) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'cmp_int_eq'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp eq <32 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp eq <64 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp eq <128 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp eq i16 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp eq <8 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp eq <16 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp eq <32 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp eq <64 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp eq i32 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp eq <4 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp eq <8 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp eq <16 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp eq <32 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp eq i64 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp eq <2 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp eq <4 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp eq <8 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp eq <16 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'cmp_int_eq'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp eq i8 undef, undef
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp eq <16 x i8> undef, undef
@ -573,6 +596,29 @@ define i32 @cmp_int_ne(i32 %arg) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'cmp_int_ne'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ne <32 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ne <64 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ne <128 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ne i16 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ne <8 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ne <16 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ne <32 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ne <64 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ne i32 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ne <4 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ne <8 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ne <16 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ne <32 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ne i64 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp ne <2 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp ne <4 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp ne <8 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp ne <16 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'cmp_int_ne'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ne i8 undef, undef
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ne <16 x i8> undef, undef
@ -877,6 +923,29 @@ define i32 @cmp_int_sge(i32 %arg) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'cmp_int_sge'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sge <32 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sge <64 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sge <128 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sge i16 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sge <8 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sge <16 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sge <32 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sge <64 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sge i32 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sge <4 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sge <8 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sge <16 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sge <32 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sge i64 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sge <2 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sge <4 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sge <8 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sge <16 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'cmp_int_sge'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sge i8 undef, undef
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sge <16 x i8> undef, undef
@ -1181,6 +1250,29 @@ define i32 @cmp_int_uge(i32 %arg) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'cmp_int_uge'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp uge <32 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp uge <64 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp uge <128 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp uge i16 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp uge <8 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp uge <16 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp uge <32 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp uge <64 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp uge i32 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp uge <4 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp uge <8 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp uge <16 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp uge <32 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp uge i64 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp uge <2 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp uge <4 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp uge <8 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp uge <16 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'cmp_int_uge'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp uge i8 undef, undef
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp uge <16 x i8> undef, undef
@ -1485,6 +1577,29 @@ define i32 @cmp_int_sgt(i32 %arg) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'cmp_int_sgt'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp sgt <32 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp sgt <64 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp sgt <128 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sgt i16 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp sgt <8 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp sgt <16 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp sgt <32 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp sgt <64 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sgt i32 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp sgt <4 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp sgt <8 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp sgt <16 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp sgt <32 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sgt i64 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp sgt <2 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp sgt <4 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp sgt <8 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp sgt <16 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'cmp_int_sgt'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sgt i8 undef, undef
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp sgt <16 x i8> undef, undef
@ -1789,6 +1904,29 @@ define i32 @cmp_int_ugt(i32 %arg) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'cmp_int_ugt'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ugt <32 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ugt <64 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ugt <128 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ugt i16 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ugt <8 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ugt <16 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ugt <32 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ugt <64 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ugt i32 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ugt <4 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ugt <8 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ugt <16 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ugt <32 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ugt i64 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ugt <2 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ugt <4 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ugt <8 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ugt <16 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'cmp_int_ugt'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ugt i8 undef, undef
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ugt <16 x i8> undef, undef
@ -2093,6 +2231,29 @@ define i32 @cmp_int_sle(i32 %arg) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'cmp_int_sle'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp sle <32 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp sle <64 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp sle <128 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp sle i16 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp sle <8 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp sle <16 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp sle <32 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp sle <64 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp sle i32 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp sle <4 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp sle <8 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp sle <16 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp sle <32 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp sle i64 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2I64 = icmp sle <2 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4I64 = icmp sle <4 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V8I64 = icmp sle <8 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V16I64 = icmp sle <16 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'cmp_int_sle'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp sle i8 undef, undef
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp sle <16 x i8> undef, undef
@ -2397,6 +2558,29 @@ define i32 @cmp_int_ule(i32 %arg) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'cmp_int_ule'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I8 = icmp ule <32 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I8 = icmp ule <64 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V128I8 = icmp ule <128 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ule i16 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I16 = icmp ule <8 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I16 = icmp ule <16 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I16 = icmp ule <32 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V64I16 = icmp ule <64 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ule i32 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = icmp ule <4 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = icmp ule <8 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V16I32 = icmp ule <16 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V32I32 = icmp ule <32 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ule i64 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = icmp ule <2 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = icmp ule <4 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = icmp ule <8 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 40 for instruction: %V16I64 = icmp ule <16 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'cmp_int_ule'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ule i8 undef, undef
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I8 = icmp ule <16 x i8> undef, undef
@ -2701,6 +2885,29 @@ define i32 @cmp_int_slt(i32 %arg) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'cmp_int_slt'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = icmp slt <32 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = icmp slt <64 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V128I8 = icmp slt <128 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp slt i16 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = icmp slt <8 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = icmp slt <16 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = icmp slt <32 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V64I16 = icmp slt <64 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp slt i32 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = icmp slt <4 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V8I32 = icmp slt <8 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V16I32 = icmp slt <16 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V32I32 = icmp slt <32 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp slt i64 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V2I64 = icmp slt <2 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V4I64 = icmp slt <4 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V8I64 = icmp slt <8 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V16I64 = icmp slt <16 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'cmp_int_slt'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp slt i8 undef, undef
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = icmp slt <16 x i8> undef, undef
@ -3005,6 +3212,29 @@ define i32 @cmp_int_ult(i32 %arg) {
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef
 ; XOPAVX2-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
+; SLM-LABEL: 'cmp_int_ult'
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = icmp ult <32 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = icmp ult <64 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V128I8 = icmp ult <128 x i8> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I16 = icmp ult i16 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = icmp ult <8 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = icmp ult <16 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = icmp ult <32 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V64I16 = icmp ult <64 x i16> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I32 = icmp ult i32 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4I32 = icmp ult <4 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V8I32 = icmp ult <8 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V16I32 = icmp ult <16 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V32I32 = icmp ult <32 x i32> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I64 = icmp ult i64 undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I64 = icmp ult <2 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I64 = icmp ult <4 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I64 = icmp ult <8 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 32 for instruction: %V16I64 = icmp ult <16 x i64> undef, undef
+; SLM-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
 ; BTVER2-LABEL: 'cmp_int_ult'
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %I8 = icmp ult i8 undef, undef
 ; BTVER2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = icmp ult <16 x i8> undef, undef
--- a/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
+++ b/test/Transforms/SLPVectorizer/X86/arith-add-usat.ll
@ -62,22 +62,38 @@ define void @add_v8i64() {
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @add_v8i64(
-; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
-; SLM-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
-; SLM-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
-; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.uadd.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
-; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SLM-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SLM-NEXT:    [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SLM-NEXT:    [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SLM-NEXT:    [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SLM-NEXT:    [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SLM-NEXT:    [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SLM-NEXT:    [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SLM-NEXT:    [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SLM-NEXT:    [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SLM-NEXT:    [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SLM-NEXT:    [[R0:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A0]], i64 [[B0]])
+; SLM-NEXT:    [[R1:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A1]], i64 [[B1]])
+; SLM-NEXT:    [[R2:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A2]], i64 [[B2]])
+; SLM-NEXT:    [[R3:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A3]], i64 [[B3]])
+; SLM-NEXT:    [[R4:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A4]], i64 [[B4]])
+; SLM-NEXT:    [[R5:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A5]], i64 [[B5]])
+; SLM-NEXT:    [[R6:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A6]], i64 [[B6]])
+; SLM-NEXT:    [[R7:%.*]] = call i64 @llvm.uadd.sat.i64(i64 [[A7]], i64 [[B7]])
+; SLM-NEXT:    store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SLM-NEXT:    store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SLM-NEXT:    store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SLM-NEXT:    store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SLM-NEXT:    store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SLM-NEXT:    store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SLM-NEXT:    store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SLM-NEXT:    store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @add_v8i64(
--- a/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
+++ b/test/Transforms/SLPVectorizer/X86/arith-sub-usat.ll
@ -62,22 +62,38 @@ define void @sub_v8i64() {
 ; SSE-NEXT:    ret void
 ;
 ; SLM-LABEL: @sub_v8i64(
-; SLM-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
-; SLM-NEXT:    [[TMP9:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP1]], <2 x i64> [[TMP5]])
-; SLM-NEXT:    [[TMP10:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP2]], <2 x i64> [[TMP6]])
-; SLM-NEXT:    [[TMP11:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP3]], <2 x i64> [[TMP7]])
-; SLM-NEXT:    [[TMP12:%.*]] = call <2 x i64> @llvm.usub.sat.v2i64(<2 x i64> [[TMP4]], <2 x i64> [[TMP8]])
-; SLM-NEXT:    store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
-; SLM-NEXT:    store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
-; SLM-NEXT:    store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
-; SLM-NEXT:    store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
+; SLM-NEXT:    [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
+; SLM-NEXT:    [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
+; SLM-NEXT:    [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
+; SLM-NEXT:    [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
+; SLM-NEXT:    [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
+; SLM-NEXT:    [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
+; SLM-NEXT:    [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
+; SLM-NEXT:    [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
+; SLM-NEXT:    [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
+; SLM-NEXT:    [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
+; SLM-NEXT:    [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
+; SLM-NEXT:    [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
+; SLM-NEXT:    [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
+; SLM-NEXT:    [[R0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A0]], i64 [[B0]])
+; SLM-NEXT:    [[R1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A1]], i64 [[B1]])
+; SLM-NEXT:    [[R2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A2]], i64 [[B2]])
+; SLM-NEXT:    [[R3:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A3]], i64 [[B3]])
+; SLM-NEXT:    [[R4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A4]], i64 [[B4]])
+; SLM-NEXT:    [[R5:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A5]], i64 [[B5]])
+; SLM-NEXT:    [[R6:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A6]], i64 [[B6]])
+; SLM-NEXT:    [[R7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[A7]], i64 [[B7]])
+; SLM-NEXT:    store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
+; SLM-NEXT:    store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
+; SLM-NEXT:    store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
+; SLM-NEXT:    store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
+; SLM-NEXT:    store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
+; SLM-NEXT:    store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
+; SLM-NEXT:    store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
+; SLM-NEXT:    store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
 ; SLM-NEXT:    ret void
 ;
 ; AVX-LABEL: @sub_v8i64(