AMDGPU: Don't create weird sized integers

There's no reason to introduce a new, unnaturally sized value here. This has a chance to produce worse code with legalization. Avoids regression in a future patch.
2024-11-23 03:02:36 +01:00 · 2020-01-18 20:57:22 -05:00 · 2020-01-18 20:57:22 -05:00 · 87ac35b140
commit 87ac35b140
parent ddf5bbdb8a
2 changed files with 55 additions and 55 deletions
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@ -725,10 +725,10 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
    Res = Builder.CreateSub(Num, Rem);
  }

-  // Truncate to number of bits this divide really is.
+  // Extend in register from the number of bits this divide really is.
  if (IsSigned) {
-    Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
-    Res = Builder.CreateSExt(Res, Ty);
+    Res = Builder.CreateShl(Res, 32 - DivBits);
+    Res = Builder.CreateAShr(Res, 32 - DivBits);
  } else {
    ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
    Res = Builder.CreateAnd(Res, TruncMask);
--- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@ -294,8 +294,8 @@ define amdgpu_kernel void @sdiv_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i16
-; CHECK-NEXT:    [[TMP20:%.*]] = sext i16 [[TMP19]] to i32
+; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 16
+; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 16
 ; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
 ; CHECK-NEXT:    store i16 [[TMP21]], i16 addrspace(1)* [[OUT:%.*]]
 ; CHECK-NEXT:    ret void
@ -327,8 +327,8 @@ define amdgpu_kernel void @srem_i16(i16 addrspace(1)* %out, i16 %x, i16 %y) {
 ; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
-; CHECK-NEXT:    [[TMP22:%.*]] = sext i16 [[TMP21]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
+; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
 ; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
 ; CHECK-NEXT:    store i16 [[TMP23]], i16 addrspace(1)* [[OUT:%.*]]
 ; CHECK-NEXT:    ret void
@ -414,8 +414,8 @@ define amdgpu_kernel void @sdiv_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i8
-; CHECK-NEXT:    [[TMP20:%.*]] = sext i8 [[TMP19]] to i32
+; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 24
+; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 24
 ; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
 ; CHECK-NEXT:    store i8 [[TMP21]], i8 addrspace(1)* [[OUT:%.*]]
 ; CHECK-NEXT:    ret void
@ -447,8 +447,8 @@ define amdgpu_kernel void @srem_i8(i8 addrspace(1)* %out, i8 %x, i8 %y) {
 ; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i8
-; CHECK-NEXT:    [[TMP22:%.*]] = sext i8 [[TMP21]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 24
+; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 24
 ; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i8
 ; CHECK-NEXT:    store i8 [[TMP23]], i8 addrspace(1)* [[OUT:%.*]]
 ; CHECK-NEXT:    ret void
@ -1464,8 +1464,8 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
-; CHECK-NEXT:    [[TMP22:%.*]] = sext i16 [[TMP21]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
+; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
 ; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
 ; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <4 x i16> undef, i16 [[TMP23]], i64 0
 ; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <4 x i16> [[X]], i64 1
@ -1488,8 +1488,8 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
 ; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
-; CHECK-NEXT:    [[TMP45:%.*]] = trunc i32 [[TMP44]] to i16
-; CHECK-NEXT:    [[TMP46:%.*]] = sext i16 [[TMP45]] to i32
+; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
+; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
 ; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
 ; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <4 x i16> [[TMP24]], i16 [[TMP47]], i64 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <4 x i16> [[X]], i64 2
@ -1512,8 +1512,8 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
 ; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
 ; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
-; CHECK-NEXT:    [[TMP69:%.*]] = trunc i32 [[TMP68]] to i16
-; CHECK-NEXT:    [[TMP70:%.*]] = sext i16 [[TMP69]] to i32
+; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
+; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
 ; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
 ; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <4 x i16> [[TMP48]], i16 [[TMP71]], i64 2
 ; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <4 x i16> [[X]], i64 3
@ -1536,8 +1536,8 @@ define amdgpu_kernel void @sdiv_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; CHECK-NEXT:    [[TMP90:%.*]] = fcmp fast oge float [[TMP88]], [[TMP89]]
 ; CHECK-NEXT:    [[TMP91:%.*]] = select i1 [[TMP90]], i32 [[TMP79]], i32 0
 ; CHECK-NEXT:    [[TMP92:%.*]] = add i32 [[TMP87]], [[TMP91]]
-; CHECK-NEXT:    [[TMP93:%.*]] = trunc i32 [[TMP92]] to i16
-; CHECK-NEXT:    [[TMP94:%.*]] = sext i16 [[TMP93]] to i32
+; CHECK-NEXT:    [[TMP93:%.*]] = shl i32 [[TMP92]], 16
+; CHECK-NEXT:    [[TMP94:%.*]] = ashr i32 [[TMP93]], 16
 ; CHECK-NEXT:    [[TMP95:%.*]] = trunc i32 [[TMP94]] to i16
 ; CHECK-NEXT:    [[TMP96:%.*]] = insertelement <4 x i16> [[TMP72]], i16 [[TMP95]], i64 3
 ; CHECK-NEXT:    store <4 x i16> [[TMP96]], <4 x i16> addrspace(1)* [[OUT:%.*]]
@ -1572,8 +1572,8 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
-; CHECK-NEXT:    [[TMP24:%.*]] = sext i16 [[TMP23]] to i32
+; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
+; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
 ; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
 ; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> undef, i16 [[TMP25]], i64 0
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <4 x i16> [[X]], i64 1
@ -1598,8 +1598,8 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
-; CHECK-NEXT:    [[TMP49:%.*]] = trunc i32 [[TMP48]] to i16
-; CHECK-NEXT:    [[TMP50:%.*]] = sext i16 [[TMP49]] to i32
+; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
+; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
 ; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP51]], i64 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <4 x i16> [[X]], i64 2
@ -1624,8 +1624,8 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
 ; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
 ; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
-; CHECK-NEXT:    [[TMP75:%.*]] = trunc i32 [[TMP74]] to i16
-; CHECK-NEXT:    [[TMP76:%.*]] = sext i16 [[TMP75]] to i32
+; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
+; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
 ; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
 ; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <4 x i16> [[TMP52]], i16 [[TMP77]], i64 2
 ; CHECK-NEXT:    [[TMP79:%.*]] = extractelement <4 x i16> [[X]], i64 3
@ -1650,8 +1650,8 @@ define amdgpu_kernel void @srem_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %x
 ; CHECK-NEXT:    [[TMP98:%.*]] = add i32 [[TMP93]], [[TMP97]]
 ; CHECK-NEXT:    [[TMP99:%.*]] = mul i32 [[TMP98]], [[TMP82]]
 ; CHECK-NEXT:    [[TMP100:%.*]] = sub i32 [[TMP81]], [[TMP99]]
-; CHECK-NEXT:    [[TMP101:%.*]] = trunc i32 [[TMP100]] to i16
-; CHECK-NEXT:    [[TMP102:%.*]] = sext i16 [[TMP101]] to i32
+; CHECK-NEXT:    [[TMP101:%.*]] = shl i32 [[TMP100]], 16
+; CHECK-NEXT:    [[TMP102:%.*]] = ashr i32 [[TMP101]], 16
 ; CHECK-NEXT:    [[TMP103:%.*]] = trunc i32 [[TMP102]] to i16
 ; CHECK-NEXT:    [[TMP104:%.*]] = insertelement <4 x i16> [[TMP78]], i16 [[TMP103]], i64 3
 ; CHECK-NEXT:    store <4 x i16> [[TMP104]], <4 x i16> addrspace(1)* [[OUT:%.*]]
@ -1738,8 +1738,8 @@ define amdgpu_kernel void @sdiv_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; CHECK-NEXT:    [[TMP16:%.*]] = fcmp fast oge float [[TMP14]], [[TMP15]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = select i1 [[TMP16]], i32 [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i32 [[TMP18]] to i3
-; CHECK-NEXT:    [[TMP20:%.*]] = sext i3 [[TMP19]] to i32
+; CHECK-NEXT:    [[TMP19:%.*]] = shl i32 [[TMP18]], 29
+; CHECK-NEXT:    [[TMP20:%.*]] = ashr i32 [[TMP19]], 29
 ; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
 ; CHECK-NEXT:    store i3 [[TMP21]], i3 addrspace(1)* [[OUT:%.*]]
 ; CHECK-NEXT:    ret void
@ -1771,8 +1771,8 @@ define amdgpu_kernel void @srem_i3(i3 addrspace(1)* %out, i3 %x, i3 %y) {
 ; CHECK-NEXT:    [[TMP18:%.*]] = add i32 [[TMP13]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = mul i32 [[TMP18]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP20:%.*]] = sub i32 [[TMP1]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i3
-; CHECK-NEXT:    [[TMP22:%.*]] = sext i3 [[TMP21]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 29
+; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 29
 ; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i3
 ; CHECK-NEXT:    store i3 [[TMP23]], i3 addrspace(1)* [[OUT:%.*]]
 ; CHECK-NEXT:    ret void
@ -1950,8 +1950,8 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i16
-; CHECK-NEXT:    [[TMP22:%.*]] = sext i16 [[TMP21]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 16
+; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 16
 ; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
 ; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i16> undef, i16 [[TMP23]], i64 0
 ; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i16> [[X]], i64 1
@ -1974,8 +1974,8 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
 ; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
-; CHECK-NEXT:    [[TMP45:%.*]] = trunc i32 [[TMP44]] to i16
-; CHECK-NEXT:    [[TMP46:%.*]] = sext i16 [[TMP45]] to i32
+; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 16
+; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 16
 ; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i16
 ; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i16> [[TMP24]], i16 [[TMP47]], i64 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i16> [[X]], i64 2
@ -1998,8 +1998,8 @@ define amdgpu_kernel void @sdiv_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
 ; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
 ; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
-; CHECK-NEXT:    [[TMP69:%.*]] = trunc i32 [[TMP68]] to i16
-; CHECK-NEXT:    [[TMP70:%.*]] = sext i16 [[TMP69]] to i32
+; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 16
+; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 16
 ; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i16
 ; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i16> [[TMP48]], i16 [[TMP71]], i64 2
 ; CHECK-NEXT:    store <3 x i16> [[TMP72]], <3 x i16> addrspace(1)* [[OUT:%.*]]
@ -2034,8 +2034,8 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i16
-; CHECK-NEXT:    [[TMP24:%.*]] = sext i16 [[TMP23]] to i32
+; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 16
+; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 16
 ; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i16
 ; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i16> undef, i16 [[TMP25]], i64 0
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i16> [[X]], i64 1
@ -2060,8 +2060,8 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
-; CHECK-NEXT:    [[TMP49:%.*]] = trunc i32 [[TMP48]] to i16
-; CHECK-NEXT:    [[TMP50:%.*]] = sext i16 [[TMP49]] to i32
+; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 16
+; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 16
 ; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i16
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i16> [[TMP26]], i16 [[TMP51]], i64 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i16> [[X]], i64 2
@ -2086,8 +2086,8 @@ define amdgpu_kernel void @srem_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %x
 ; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
 ; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
 ; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
-; CHECK-NEXT:    [[TMP75:%.*]] = trunc i32 [[TMP74]] to i16
-; CHECK-NEXT:    [[TMP76:%.*]] = sext i16 [[TMP75]] to i32
+; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 16
+; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 16
 ; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i16
 ; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i16> [[TMP52]], i16 [[TMP77]], i64 2
 ; CHECK-NEXT:    store <3 x i16> [[TMP78]], <3 x i16> addrspace(1)* [[OUT:%.*]]
@ -2266,8 +2266,8 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; CHECK-NEXT:    [[TMP18:%.*]] = fcmp fast oge float [[TMP16]], [[TMP17]]
 ; CHECK-NEXT:    [[TMP19:%.*]] = select i1 [[TMP18]], i32 [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i32 [[TMP20]] to i15
-; CHECK-NEXT:    [[TMP22:%.*]] = sext i15 [[TMP21]] to i32
+; CHECK-NEXT:    [[TMP21:%.*]] = shl i32 [[TMP20]], 17
+; CHECK-NEXT:    [[TMP22:%.*]] = ashr i32 [[TMP21]], 17
 ; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15
 ; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i15> undef, i15 [[TMP23]], i64 0
 ; CHECK-NEXT:    [[TMP25:%.*]] = extractelement <3 x i15> [[X]], i64 1
@ -2290,8 +2290,8 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; CHECK-NEXT:    [[TMP42:%.*]] = fcmp fast oge float [[TMP40]], [[TMP41]]
 ; CHECK-NEXT:    [[TMP43:%.*]] = select i1 [[TMP42]], i32 [[TMP31]], i32 0
 ; CHECK-NEXT:    [[TMP44:%.*]] = add i32 [[TMP39]], [[TMP43]]
-; CHECK-NEXT:    [[TMP45:%.*]] = trunc i32 [[TMP44]] to i15
-; CHECK-NEXT:    [[TMP46:%.*]] = sext i15 [[TMP45]] to i32
+; CHECK-NEXT:    [[TMP45:%.*]] = shl i32 [[TMP44]], 17
+; CHECK-NEXT:    [[TMP46:%.*]] = ashr i32 [[TMP45]], 17
 ; CHECK-NEXT:    [[TMP47:%.*]] = trunc i32 [[TMP46]] to i15
 ; CHECK-NEXT:    [[TMP48:%.*]] = insertelement <3 x i15> [[TMP24]], i15 [[TMP47]], i64 1
 ; CHECK-NEXT:    [[TMP49:%.*]] = extractelement <3 x i15> [[X]], i64 2
@ -2314,8 +2314,8 @@ define amdgpu_kernel void @sdiv_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; CHECK-NEXT:    [[TMP66:%.*]] = fcmp fast oge float [[TMP64]], [[TMP65]]
 ; CHECK-NEXT:    [[TMP67:%.*]] = select i1 [[TMP66]], i32 [[TMP55]], i32 0
 ; CHECK-NEXT:    [[TMP68:%.*]] = add i32 [[TMP63]], [[TMP67]]
-; CHECK-NEXT:    [[TMP69:%.*]] = trunc i32 [[TMP68]] to i15
-; CHECK-NEXT:    [[TMP70:%.*]] = sext i15 [[TMP69]] to i32
+; CHECK-NEXT:    [[TMP69:%.*]] = shl i32 [[TMP68]], 17
+; CHECK-NEXT:    [[TMP70:%.*]] = ashr i32 [[TMP69]], 17
 ; CHECK-NEXT:    [[TMP71:%.*]] = trunc i32 [[TMP70]] to i15
 ; CHECK-NEXT:    [[TMP72:%.*]] = insertelement <3 x i15> [[TMP48]], i15 [[TMP71]], i64 2
 ; CHECK-NEXT:    store <3 x i15> [[TMP72]], <3 x i15> addrspace(1)* [[OUT:%.*]]
@ -2350,8 +2350,8 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP15]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = mul i32 [[TMP20]], [[TMP4]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = sub i32 [[TMP3]], [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = trunc i32 [[TMP22]] to i15
-; CHECK-NEXT:    [[TMP24:%.*]] = sext i15 [[TMP23]] to i32
+; CHECK-NEXT:    [[TMP23:%.*]] = shl i32 [[TMP22]], 17
+; CHECK-NEXT:    [[TMP24:%.*]] = ashr i32 [[TMP23]], 17
 ; CHECK-NEXT:    [[TMP25:%.*]] = trunc i32 [[TMP24]] to i15
 ; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <3 x i15> undef, i15 [[TMP25]], i64 0
 ; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <3 x i15> [[X]], i64 1
@ -2376,8 +2376,8 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; CHECK-NEXT:    [[TMP46:%.*]] = add i32 [[TMP41]], [[TMP45]]
 ; CHECK-NEXT:    [[TMP47:%.*]] = mul i32 [[TMP46]], [[TMP30]]
 ; CHECK-NEXT:    [[TMP48:%.*]] = sub i32 [[TMP29]], [[TMP47]]
-; CHECK-NEXT:    [[TMP49:%.*]] = trunc i32 [[TMP48]] to i15
-; CHECK-NEXT:    [[TMP50:%.*]] = sext i15 [[TMP49]] to i32
+; CHECK-NEXT:    [[TMP49:%.*]] = shl i32 [[TMP48]], 17
+; CHECK-NEXT:    [[TMP50:%.*]] = ashr i32 [[TMP49]], 17
 ; CHECK-NEXT:    [[TMP51:%.*]] = trunc i32 [[TMP50]] to i15
 ; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <3 x i15> [[TMP26]], i15 [[TMP51]], i64 1
 ; CHECK-NEXT:    [[TMP53:%.*]] = extractelement <3 x i15> [[X]], i64 2
@ -2402,8 +2402,8 @@ define amdgpu_kernel void @srem_v3i15(<3 x i15> addrspace(1)* %out, <3 x i15> %x
 ; CHECK-NEXT:    [[TMP72:%.*]] = add i32 [[TMP67]], [[TMP71]]
 ; CHECK-NEXT:    [[TMP73:%.*]] = mul i32 [[TMP72]], [[TMP56]]
 ; CHECK-NEXT:    [[TMP74:%.*]] = sub i32 [[TMP55]], [[TMP73]]
-; CHECK-NEXT:    [[TMP75:%.*]] = trunc i32 [[TMP74]] to i15
-; CHECK-NEXT:    [[TMP76:%.*]] = sext i15 [[TMP75]] to i32
+; CHECK-NEXT:    [[TMP75:%.*]] = shl i32 [[TMP74]], 17
+; CHECK-NEXT:    [[TMP76:%.*]] = ashr i32 [[TMP75]], 17
 ; CHECK-NEXT:    [[TMP77:%.*]] = trunc i32 [[TMP76]] to i15
 ; CHECK-NEXT:    [[TMP78:%.*]] = insertelement <3 x i15> [[TMP52]], i15 [[TMP77]], i64 2
 ; CHECK-NEXT:    store <3 x i15> [[TMP78]], <3 x i15> addrspace(1)* [[OUT:%.*]]