AMDGPU: Remove some uses of llvm.SI.export in tests

Merge some of the old, smaller tests into more complete versions.

llvm-svn: 295792
2024-10-19 11:02:59 +02:00 · 2017-02-22 00:02:21 +00:00 · 2017-02-22 00:02:21 +00:00 · 3320e649a3
commit 3320e649a3
parent 65d8dccee7
33 changed files with 952 additions and 1065 deletions
--- a/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -3,19 +3,15 @@

 ; This test just checks that the compiler doesn't crash.

-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
 ; FUNC-LABEL: {{^}}v32i8_to_v8i32:
-; SI: s_endpgm
-define amdgpu_ps void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
+define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
 entry:
  %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0
  %2 = bitcast <32 x i8> %1 to <8 x i32>
  %3 = extractelement <8 x i32> %2, i32 1
  %4 = icmp ne i32 %3, 0
  %5 = select i1 %4, float 0.0, float 1.0
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5)
-  ret void
+  ret float %5
 }

 ; FUNC-LABEL: {{^}}i8ptr_v16i8ptr:
--- a/test/CodeGen/AMDGPU/commute-shifts.ll
+++ b/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -4,7 +4,7 @@
 ; GCN-LABEL: {{^}}main:
 ; SI: v_lshl_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 1
-define amdgpu_ps void @main(float %arg0, float %arg1) #0 {
+define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
 bb:
  %tmp = fptosi float %arg0 to i32
  %tmp1 = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
@@ -17,13 +17,11 @@ bb:
  %tmp7 = select i1 %tmp6, float 0.000000e+00, float %arg1
  %tmp8 = call i32 @llvm.SI.packf16(float undef, float %tmp7)
  %tmp9 = bitcast i32 %tmp8 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp9, float undef, float %tmp9)
-  ret void
+  ret float %tmp9
 }

 declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
 declare i32 @llvm.SI.packf16(float, float) #1
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/default-fp-mode.ll
+++ b/test/CodeGen/AMDGPU/default-fp-mode.ll
@@ -97,18 +97,15 @@ main_body:

 ; GCN-LABEL: {{^}}kill_vcc_implicit_def:
 ; GCN: IeeeMode: 0
-define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
+define amdgpu_ps float @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
 entry:
  %tmp0 = fcmp olt float %13, 0.0
  call void @llvm.AMDGPU.kill(float %14)
  %tmp1 = select i1 %tmp0, float 1.0, float 0.0
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1)
-  ret void
+  ret float %tmp1
 }

-
 declare void @llvm.AMDGPU.kill(float)
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

 attributes #0 = { nounwind "target-cpu"="tahiti" }
 attributes #1 = { nounwind "target-cpu"="fiji" }
--- a/test/CodeGen/AMDGPU/elf.ll
+++ b/test/CodeGen/AMDGPU/elf.ll
@@ -24,11 +24,13 @@
 ; TONGA-NEXT: .long   704
 ; CONFIG: .p2align 8
 ; CONFIG: test:
-define amdgpu_ps void @test(i32 %p) {
+define amdgpu_ps void @test(i32 %p) #0 {
   %i = add i32 %p, 2
   %r = bitcast i32 %i to float
-   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
+   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %r, float %r, float %r, i1 true, i1 false)
   ret void
 }

-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
--- a/test/CodeGen/AMDGPU/imm.ll
+++ b/test/CodeGen/AMDGPU/imm.ll
@@ -667,3 +667,18 @@ define void @store_literal_imm_f64(double addrspace(1)* %out) {
  store double 4096.0, double addrspace(1)* %out
  ret void
 }
+
+; GCN-LABEL: {{^}}literal_folding:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f4353f8, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0xbf4353f8, v{{[0-9]+}}
+define amdgpu_vs void @literal_folding(float %arg) {
+main_body:
+  %tmp = fmul float %arg, 0x3FE86A7F00000000
+  %tmp1 = fmul float %arg, 0xBFE86A7F00000000
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp, float %tmp, float %tmp1, float %tmp1, i1 true, i1 false) #0
+  ret void
+}
+
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
--- a/test/CodeGen/AMDGPU/insert-waits-exp.mir
+++ b/test/CodeGen/AMDGPU/insert-waits-exp.mir
@@ -1,18 +1,18 @@
 # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s
 --- |
-  define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+  define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x
+  i32> inreg, i32 inreg %w, float %v) #0 {
    %a = load volatile float, float addrspace(1)* undef
    %b = load volatile float, float addrspace(1)* undef
    %c = load volatile float, float addrspace(1)* undef
    %d = load volatile float, float addrspace(1)* undef
-    call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %a, float %b, float %c, float %d)
+    call void @llvm.amdgcn.exp.f32(i32 15, i32 1, float %a, float %b, float %c, float %d, i1 true, i1 false)
    ret <4 x float> <float 5.000000e-01, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>
  }

-  declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+  declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0

-  attributes #0 = { readnone }
-  attributes #1 = { nounwind }
+  attributes #0 = { nounwind }

 ...
 ---
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
@@ -4,15 +4,14 @@
 ; SI-LABEL: {{^}}kill_gs_const:
 ; SI-NOT: v_cmpx_le_f32
 ; SI: s_mov_b64 exec, 0
-
 define amdgpu_gs void @kill_gs_const() {
 main_body:
-  %0 = icmp ule i32 0, 3
-  %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00
-  call void @llvm.AMDGPU.kill(float %1)
-  %2 = icmp ule i32 3, 0
-  %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00
-  call void @llvm.AMDGPU.kill(float %3)
+  %tmp = icmp ule i32 0, 3
+  %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
+  call void @llvm.AMDGPU.kill(float %tmp1)
+  %tmp2 = icmp ule i32 3, 0
+  %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
+  call void @llvm.AMDGPU.kill(float %tmp3)
  ret void
 }

@@ -21,16 +20,16 @@ main_body:
 ; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
 ; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
-define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
+define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) {
 entry:
-  %tmp0 = fcmp olt float %13, 0.0
-  call void @llvm.AMDGPU.kill(float %14)
-  %tmp1 = select i1 %tmp0, float 1.0, float 0.0
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1)
+  %tmp0 = fcmp olt float %arg13, 0.000000e+00
+  call void @llvm.AMDGPU.kill(float %arg14)
+  %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
  ret void
 }

-declare void @llvm.AMDGPU.kill(float)
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.AMDGPU.kill(float) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0

-!0 = !{!"const", null, i32 1}
+attributes #0 = { nounwind }
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -1,146 +1,144 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=CHECK,VI %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s

-;CHECK-LABEL: {{^}}image_load_v4i32:
-;CHECK: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_v4i32:
+; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
 main_body:
-  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
  ret <4 x float> %tex
 }

-;CHECK-LABEL: {{^}}image_load_v2i32:
-;CHECK: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_v2i32:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 {
 main_body:
-  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
  ret <4 x float> %tex
 }

-;CHECK-LABEL: {{^}}image_load_i32:
-;CHECK: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) {
+; GCN-LABEL: {{^}}image_load_i32:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) #0 {
 main_body:
-  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
  ret <4 x float> %tex
 }

-;CHECK-LABEL: {{^}}image_load_mip:
-;CHECK: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_mip:
+; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
 main_body:
-  %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
  ret <4 x float> %tex
 }

-;CHECK-LABEL: {{^}}image_load_1:
-;CHECK: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_1:
+; GCN: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
 main_body:
-  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
  %elt = extractelement <4 x float> %tex, i32 0
-; Only first component used, test that dmask etc. is changed accordingly
  ret float %elt
 }

-;CHECK-LABEL: {{^}}image_load_f32_v2i32:
-;CHECK: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_f32_v2i32:
+; GCN: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 {
 main_body:
-  %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0)
+  %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false)
  ret float %tex
 }

-;CHECK-LABEL: {{^}}image_load_v2f32_v4i32:
-;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_v2f32_v4i32:
+; GCN: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
 main_body:
-  %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0)
+  %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false)
  ret <2 x float> %tex
 }

-
-;CHECK-LABEL: {{^}}image_store_v4i32:
-;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm
-define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
+; GCN-LABEL: {{^}}image_store_v4i32:
+; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
  ret void
 }

-;CHECK-LABEL: {{^}}image_store_v2i32:
-;CHECK: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
-define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) {
+; GCN-LABEL: {{^}}image_store_v2i32:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
  ret void
 }

-;CHECK-LABEL: {{^}}image_store_i32:
-;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
-define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) {
+; GCN-LABEL: {{^}}image_store_i32:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
  ret void
 }

-;CHECK-LABEL: {{^}}image_store_f32_i32:
-;CHECK: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
-define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) {
+; GCN-LABEL: {{^}}image_store_f32_i32:
+; GCN: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
+define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false)
  ret void
 }

-;CHECK-LABEL: {{^}}image_store_v2f32_v4i32:
-;CHECK: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
-define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) {
+; GCN-LABEL: {{^}}image_store_v2f32_v4i32:
+; GCN: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
+define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false)
  ret void
 }

-;CHECK-LABEL: {{^}}image_store_mip:
-;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
-define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
+; GCN-LABEL: {{^}}image_store_mip:
+; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.mip.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.mip.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
  ret void
 }

-;CHECK-LABEL: {{^}}getresinfo:
-;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @getresinfo() {
+; GCN-LABEL: {{^}}getresinfo:
+; GCN: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @getresinfo() #0 {
 main_body:
-  %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false)
  %r0 = extractelement <4 x float> %r, i32 0
  %r1 = extractelement <4 x float> %r, i32 1
  %r2 = extractelement <4 x float> %r, i32 2
  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r0, float %r1, float %r2, float %r3, i1 true, i1 true) #0
  ret void
 }

 ; Ideally, the register allocator would avoid the wait here
 ;
-;CHECK-LABEL: {{^}}image_store_wait:
-;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0) expcnt(0)
-;CHECK: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
-define amdgpu_ps void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) {
+; GCN-LABEL: {{^}}image_store_wait:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0) expcnt(0)
+; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
+define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %arg1, <8 x i32> inreg %arg2, <4 x float> %arg3, i32 %arg4) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0)
-  %data = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0)
-  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %4, <8 x i32> %2, i32 15, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %arg3, i32 %arg4, <8 x i32> %arg, i32 15, i1 false, i1 false, i1 false, i1 false)
+  %data = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %arg4, <8 x i32> %arg1, i32 15, i1 false, i1 false, i1 false, i1 false)
+  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %arg4, <8 x i32> %arg2, i32 15, i1 false, i1 false, i1 false, i1 false)
  ret void
 }

@@ -149,21 +147,22 @@ main_body:
 ; VI-LABEL: image_load_mmo
 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps void @image_load_mmo(float addrspace(3)* %lds, <2 x i32> %c, <8 x i32> inreg %rsrc) {
-  store float 0.0, float addrspace(3)* %lds
-  %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+define amdgpu_ps void @image_load_mmo(float addrspace(3)* %lds, <2 x i32> %c, <8 x i32> inreg %rsrc) #0 {
+bb:
+  store float 0.000000e+00, float addrspace(3)* %lds
+  %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
  %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
-  store float 0.0, float addrspace(3)* %tmp2
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tex, float %tex, float %tex, float %tex)
+  store float 0.000000e+00, float addrspace(3)* %tmp2
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex, float %tex, float %tex, float %tex, i1 true, i1 true) #0
  ret void
 }

 declare float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare void @llvm.amdgcn.image.store.f32.i32.v8i32(float, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
+
+
 declare void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
-
-
 declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
 declare void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
 declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
@@ -173,10 +172,9 @@ declare <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32, <8 x i32>, i32,
 declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1

-declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
@@ -3,7 +3,6 @@
 ; RUN: llc -march=amdgcn -mcpu=kabini -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,16BANK %s
 ; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,16BANK %s

-
 ; GCN-LABEL: {{^}}v_interp:
 ; GCN-NOT: s_wqm
 ; GCN: s_mov_b32 m0, s{{[0-9]+}}
@@ -11,17 +10,17 @@
 ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}}
 ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}}
 ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}}
-define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x float>) {
+define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x float> %arg4) #0 {
 main_body:
-  %i = extractelement <2 x float> %4, i32 0
-  %j = extractelement <2 x float> %4, i32 1
-  %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %3)
-  %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %3)
-  %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %3)
-  %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %3)
-  %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %3)
+  %i = extractelement <2 x float> %arg4, i32 0
+  %j = extractelement <2 x float> %arg4, i32 1
+  %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %arg3)
+  %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %arg3)
+  %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %arg3)
+  %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %arg3)
+  %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %arg3)
  %w = fadd float %p1_1, %const
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %p0_0, float %p0_0, float %p1_1, float %w)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_0, float %p0_0, float %p1_1, float %w, i1 true, i1 true) #0
  ret void
 }

@@ -40,7 +39,8 @@ main_body:
 ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr63.w{{$}}
 ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.w{{$}}
 ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}}
-define amdgpu_ps void @v_interp_p1(float %i) {
+define amdgpu_ps void @v_interp_p1(float %i) #0 {
+bb:
  %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 256)
  %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 256)
  %p0_2 = call float @llvm.amdgcn.interp.p1(float %i, i32 2, i32 0, i32 256)
@@ -80,7 +80,8 @@ define amdgpu_ps void @v_interp_p1(float %i) {
 ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr63.x{{$}}
 ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}}
 ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}}
-define amdgpu_ps void @v_interp_p2(float %x, float %j) {
+define amdgpu_ps void @v_interp_p2(float %x, float %j) #0 {
+bb:
  %p2_0 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 0, i32 0, i32 256)
  %p2_1 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 1, i32 0, i32 256)
  %p2_2 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 2, i32 0, i32 256)
@@ -121,7 +122,8 @@ define amdgpu_ps void @v_interp_p2(float %x, float %j) {
 ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, p10, attr64.y{{$}}
 ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, invalid_param_3, attr64.y{{$}}
 ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, invalid_param_10, attr64.x{{$}}
-define amdgpu_ps void @v_interp_mov(float %x, float %j) {
+define amdgpu_ps void @v_interp_mov(float %x, float %j) #0 {
+bb:
  %mov_0 = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 256)
  %mov_1 = call float @llvm.amdgcn.interp.mov(i32 1, i32 0, i32 0, i32 256)
  %mov_2 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 256)
@@ -164,12 +166,13 @@ define amdgpu_ps void @v_interp_mov(float %x, float %j) {
 ; VI-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}}
 ; VI: s_mov_b32 m0, -1{{$}}
 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) {
-  store float 0.0, float addrspace(3)* %lds
+define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 {
+bb:
+  store float 0.000000e+00, float addrspace(3)* %lds
  %tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0)
  %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
-  store float 0.0, float addrspace(3)* %tmp2
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1)
+  store float 0.000000e+00, float addrspace(3)* %tmp2
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
  ret void
 }

@ -178,43 +181,44 @@ define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) {

 ; GCN-LABEL: {{^}}v_interp_p1_bank16_bug:
 ; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]]
-define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg13, [17 x <4 x i32>] addrspace(2)* byval %arg14, [34 x <8 x i32>] addrspace(2)* byval %arg15, float inreg %arg16, i32 inreg %arg17, <2 x i32> %arg18, <2 x i32> %arg19, <2 x i32> %arg20, <3 x i32> %arg21, <2 x i32> %arg22, <2 x i32> %arg23, <2 x i32> %arg24, float %arg25, float %arg26, float %arg27, float %arg28, float %arg29, float %arg30, i32 %arg31, float %arg32, float %arg33) {
+define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg13, [17 x <4 x i32>] addrspace(2)* byval %arg14, [34 x <8 x i32>] addrspace(2)* byval %arg15, float inreg %arg16, i32 inreg %arg17, <2 x i32> %arg18, <2 x i32> %arg19, <2 x i32> %arg20, <3 x i32> %arg21, <2 x i32> %arg22, <2 x i32> %arg23, <2 x i32> %arg24, float %arg25, float %arg26, float %arg27, float %arg28, float %arg29, float %arg30, i32 %arg31, float %arg32, float %arg33) #0 {
 main_body:
  %i.i = extractelement <2 x i32> %arg19, i32 0
  %j.i = extractelement <2 x i32> %arg19, i32 1
  %i.f.i = bitcast i32 %i.i to float
  %j.f.i = bitcast i32 %j.i to float
-  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg17) #1
-  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg17) #1
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg17) #0
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg17) #0
  %i.i7 = extractelement <2 x i32> %arg19, i32 0
  %j.i8 = extractelement <2 x i32> %arg19, i32 1
  %i.f.i9 = bitcast i32 %i.i7 to float
  %j.f.i10 = bitcast i32 %j.i8 to float
-  %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 0, i32 %arg17) #1
-  %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 0, i32 %arg17) #1
+  %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 0, i32 %arg17) #0
+  %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 0, i32 %arg17) #0
  %i.i1 = extractelement <2 x i32> %arg19, i32 0
  %j.i2 = extractelement <2 x i32> %arg19, i32 1
  %i.f.i3 = bitcast i32 %i.i1 to float
  %j.f.i4 = bitcast i32 %j.i2 to float
-  %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 0, i32 %arg17) #1
-  %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 0, i32 %arg17) #1
+  %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 0, i32 %arg17) #0
+  %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 0, i32 %arg17) #0
  %tmp = call float @llvm.fabs.f32(float %p2.i)
  %tmp34 = call float @llvm.fabs.f32(float %p2.i12)
  %tmp35 = call float @llvm.fabs.f32(float %p2.i6)
  %tmp36 = call i32 @llvm.SI.packf16(float %tmp, float %tmp34)
-  %tmp37 = bitcast i32 %tmp36 to float
+  %tmp37 = bitcast i32 %tmp36 to <2 x half>
  %tmp38 = call i32 @llvm.SI.packf16(float %tmp35, float 1.000000e+00)
-  %tmp39 = bitcast i32 %tmp38 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp37, float %tmp39, float %tmp37, float %tmp39)
+  %tmp39 = bitcast i32 %tmp38 to <2 x half>
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp37, <2 x half> %tmp39, i1 true, i1 true) #0
  ret void
 }

-declare float @llvm.fabs.f32(float) #0
-declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
-declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
-declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0
-declare i32 @llvm.SI.packf16(float, float) #0
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare i32 @llvm.SI.packf16(float, float) #1

-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
@ -1,24 +1,22 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI  %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s

 ; GCN-LABEL: {{^}}mbcnt_intrinsics:
 ; GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0
 ; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
 ; VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]]
-
-define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) {
+define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3) {
 main_body:
-  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
-  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #1
-  %4 = bitcast i32 %hi to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %4, float %4, float %4, float %4)
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #0
+  %tmp = bitcast i32 %hi to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp, float %tmp, float %tmp, float %tmp, i1 true, i1 true) #1
  ret void
 }

-declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1

-declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
--- a/test/CodeGen/AMDGPU/lshl.ll
+++ b/test/CodeGen/AMDGPU/lshl.ll
@ -1,15 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
-
-;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1
-
-define void @test(i32 %p) {
-   %i = mul i32 %p, 2
-   %r = bitcast i32 %i to float
-   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
-   ret void
-}
-
-declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
--- a/test/CodeGen/AMDGPU/lshr.ll
+++ b/test/CodeGen/AMDGPU/lshr.ll
@ -1,15 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
-
-;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1
-
-define void @test(i32 %p) {
-   %i = udiv i32 %p, 2
-   %r = bitcast i32 %i to float
-   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
-   ret void
-}
-
-declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
--- a/test/CodeGen/AMDGPU/mulhu.ll
+++ b/test/CodeGen/AMDGPU/mulhu.ll
@ -1,17 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
-;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}}
-;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-
-define void @test(i32 %p) {
-   %i = udiv i32 %p, 3
-   %r = bitcast i32 %i to float
-   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
-   ret void
-}
-
-declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
--- a/test/CodeGen/AMDGPU/ret.ll
+++ b/test/CodeGen/AMDGPU/ret.ll
@ -1,25 +1,24 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
 ; GCN-LABEL: {{^}}vgpr:
 ; GCN: v_mov_b32_e32 v1, v0
 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
-; GCN-DAG: exp mrt0 v1, v1, v1, v1 done compr vm
+; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm
 ; GCN: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
-define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
-  %x = fadd float %3, 1.0
-  %a = insertvalue {float, float} undef, float %x, 0
-  %b = insertvalue {float, float} %a, float %3, 1
-  ret {float, float} %b
+define amdgpu_vs { float, float } @vgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
+  %x = fadd float %arg3, 1.000000e+00
+  %a = insertvalue { float, float } undef, float %x, 0
+  %b = insertvalue { float, float } %a, float %arg3, 1
+  ret { float, float } %b
 }

 ; GCN-LABEL: {{^}}vgpr_literal:
 ; GCN: v_mov_b32_e32 v4, v0
-; GCN: exp mrt0 v4, v4, v4, v4 done compr vm
+; GCN: exp mrt0 v4, v4, v4, v4 done vm

 ; GCN-DAG: v_mov_b32_e32 v0, 1.0
 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
@ -27,12 +26,12 @@ define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 i
 ; GCN-DAG: v_mov_b32_e32 v3, -1.0
 ; GCN: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
-define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
-  ret {float, float, float, float} {float 1.0, float 2.0, float 4.0, float -1.0}
+define amdgpu_vs { float, float, float, float } @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
+  ret { float, float, float, float } { float 1.000000e+00, float 2.000000e+00, float 4.000000e+00, float -1.000000e+00 }
 }

-
 ; GCN: .long 165580
 ; GCN-NEXT: .long 562
 ; GCN-NEXT: .long 165584
@ -44,24 +43,24 @@ define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addr
 ; GCN: v_mov_b32_e32 v3, v4
 ; GCN: v_mov_b32_e32 v4, v6
 ; GCN-NOT: s_endpgm
-define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
-  %i0 = extractelement <2 x i32> %4, i32 0
-  %i1 = extractelement <2 x i32> %4, i32 1
-  %i2 = extractelement <2 x i32> %7, i32 0
-  %i3 = extractelement <2 x i32> %8, i32 0
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+bb:
+  %i0 = extractelement <2 x i32> %arg4, i32 0
+  %i1 = extractelement <2 x i32> %arg4, i32 1
+  %i2 = extractelement <2 x i32> %arg7, i32 0
+  %i3 = extractelement <2 x i32> %arg8, i32 0
  %f0 = bitcast i32 %i0 to float
  %f1 = bitcast i32 %i1 to float
  %f2 = bitcast i32 %i2 to float
  %f3 = bitcast i32 %i3 to float
-  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
-  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
-  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
-  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
-  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
-  ret {float, float, float, float, float} %r4
+  %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0
+  %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1
+  %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2
+  %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3
+  %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4
+  ret { float, float, float, float, float } %r4
 }

-
 ; GCN: .long 165580
 ; GCN-NEXT: .long 1
 ; GCN-NEXT: .long 165584
@ -69,11 +68,11 @@ define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i
 ; GCN-LABEL: {{^}}ps_input_ena_no_inputs:
 ; GCN: v_mov_b32_e32 v0, 1.0
 ; GCN-NOT: s_endpgm
-define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
-  ret float 1.0
+define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+bb:
+  ret float 1.000000e+00
 }

-
 ; GCN: .long 165580
 ; GCN-NEXT: .long 2081
 ; GCN-NEXT: .long 165584
@ -83,14 +82,14 @@ define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byv
 ; GCN-DAG: v_mov_b32_e32 v1, v2
 ; GCN: v_mov_b32_e32 v2, v3
 ; GCN-NOT: s_endpgm
-define amdgpu_ps {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
-  %f = bitcast <2 x i32> %8 to <2 x float>
-  %s = insertvalue {float, <2 x float>} undef, float %14, 0
-  %s1 = insertvalue {float, <2 x float>} %s, <2 x float> %f, 1
-  ret {float, <2 x float>} %s1
+define amdgpu_ps { float, <2 x float> } @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+bb:
+  %f = bitcast <2 x i32> %arg8 to <2 x float>
+  %s = insertvalue { float, <2 x float> } undef, float %arg14, 0
+  %s1 = insertvalue { float, <2 x float> } %s, <2 x float> %f, 1
+  ret { float, <2 x float> } %s1
 }

-
 ; GCN: .long 165580
 ; GCN-NEXT: .long 562
 ; GCN-NEXT: .long 165584
@ -102,25 +101,24 @@ define amdgpu_ps {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrsp
 ; GCN-DAG: v_mov_b32_e32 v3, v6
 ; GCN-DAG: v_mov_b32_e32 v4, v8
 ; GCN-NOT: s_endpgm
-attributes #1 = { "InitialPSInputAddr"="1" }
-define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
-  %i0 = extractelement <2 x i32> %4, i32 0
-  %i1 = extractelement <2 x i32> %4, i32 1
-  %i2 = extractelement <2 x i32> %7, i32 0
-  %i3 = extractelement <2 x i32> %8, i32 0
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #2 {
+bb:
+  %i0 = extractelement <2 x i32> %arg4, i32 0
+  %i1 = extractelement <2 x i32> %arg4, i32 1
+  %i2 = extractelement <2 x i32> %arg7, i32 0
+  %i3 = extractelement <2 x i32> %arg8, i32 0
  %f0 = bitcast i32 %i0 to float
  %f1 = bitcast i32 %i1 to float
  %f2 = bitcast i32 %i2 to float
  %f3 = bitcast i32 %i3 to float
-  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
-  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
-  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
-  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
-  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
-  ret {float, float, float, float, float} %r4
+  %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0
+  %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1
+  %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2
+  %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3
+  %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4
+  ret { float, float, float, float, float } %r4
 }

-
 ; GCN: .long 165580
 ; GCN-NEXT: .long 562
 ; GCN-NEXT: .long 165584
@ -132,25 +130,24 @@ define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i
 ; GCN: v_mov_b32_e32 v3, v8
 ; GCN: v_mov_b32_e32 v4, v12
 ; GCN-NOT: s_endpgm
-attributes #2 = { "InitialPSInputAddr"="119" }
-define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 {
-  %i0 = extractelement <2 x i32> %4, i32 0
-  %i1 = extractelement <2 x i32> %4, i32 1
-  %i2 = extractelement <2 x i32> %7, i32 0
-  %i3 = extractelement <2 x i32> %8, i32 0
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #3 {
+bb:
+  %i0 = extractelement <2 x i32> %arg4, i32 0
+  %i1 = extractelement <2 x i32> %arg4, i32 1
+  %i2 = extractelement <2 x i32> %arg7, i32 0
+  %i3 = extractelement <2 x i32> %arg8, i32 0
  %f0 = bitcast i32 %i0 to float
  %f1 = bitcast i32 %i1 to float
  %f2 = bitcast i32 %i2 to float
  %f3 = bitcast i32 %i3 to float
-  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
-  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
-  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
-  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
-  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
-  ret {float, float, float, float, float} %r4
+  %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0
+  %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1
+  %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2
+  %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3
+  %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4
+  ret { float, float, float, float, float } %r4
 }

-
 ; GCN: .long 165580
 ; GCN-NEXT: .long 562
 ; GCN-NEXT: .long 165584
@ -162,38 +159,37 @@ define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x
 ; GCN: v_mov_b32_e32 v3, v4
 ; GCN: v_mov_b32_e32 v4, v8
 ; GCN-NOT: s_endpgm
-attributes #3 = { "InitialPSInputAddr"="418" }
-define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 {
-  %i0 = extractelement <2 x i32> %4, i32 0
-  %i1 = extractelement <2 x i32> %4, i32 1
-  %i2 = extractelement <2 x i32> %7, i32 0
-  %i3 = extractelement <2 x i32> %8, i32 0
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #4 {
+bb:
+  %i0 = extractelement <2 x i32> %arg4, i32 0
+  %i1 = extractelement <2 x i32> %arg4, i32 1
+  %i2 = extractelement <2 x i32> %arg7, i32 0
+  %i3 = extractelement <2 x i32> %arg8, i32 0
  %f0 = bitcast i32 %i0 to float
  %f1 = bitcast i32 %i1 to float
  %f2 = bitcast i32 %i2 to float
  %f3 = bitcast i32 %i3 to float
-  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
-  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
-  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
-  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
-  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
-  ret {float, float, float, float, float} %r4
+  %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0
+  %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1
+  %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2
+  %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3
+  %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4
+  ret { float, float, float, float, float } %r4
 }

-
 ; GCN-LABEL: {{^}}sgpr:
 ; GCN: s_add_i32 s0, s3, 2
 ; GCN: s_mov_b32 s2, s3
 ; GCN-NOT: s_endpgm
-define amdgpu_vs {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
-  %x = add i32 %2, 2
-  %a = insertvalue {i32, i32, i32} undef, i32 %x, 0
-  %b = insertvalue {i32, i32, i32} %a, i32 %1, 1
-  %c = insertvalue {i32, i32, i32} %a, i32 %2, 2
-  ret {i32, i32, i32} %c
+define amdgpu_vs { i32, i32, i32 } @sgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+  %x = add i32 %arg2, 2
+  %a = insertvalue { i32, i32, i32 } undef, i32 %x, 0
+  %b = insertvalue { i32, i32, i32 } %a, i32 %arg1, 1
+  %c = insertvalue { i32, i32, i32 } %a, i32 %arg2, 2
+  ret { i32, i32, i32 } %c
 }

-
 ; GCN-LABEL: {{^}}sgpr_literal:
 ; GCN: s_mov_b32 s0, 5
 ; GCN-NOT: s_mov_b32 s0, s0
@ -201,37 +197,37 @@ define amdgpu_vs {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32
 ; GCN-DAG: s_mov_b32 s2, 7
 ; GCN-DAG: s_mov_b32 s3, 8
 ; GCN-NOT: s_endpgm
-define amdgpu_vs {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
-  %x = add i32 %2, 2
-  ret {i32, i32, i32, i32} {i32 5, i32 6, i32 7, i32 8}
+define amdgpu_vs { i32, i32, i32, i32 } @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+  %x = add i32 %arg2, 2
+  ret { i32, i32, i32, i32 } { i32 5, i32 6, i32 7, i32 8 }
 }

-
 ; GCN-LABEL: {{^}}both:
 ; GCN: v_mov_b32_e32 v1, v0
-; GCN-DAG: exp mrt0 v1, v1, v1, v1 done compr vm
+; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm
 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
 ; GCN-DAG: s_add_i32 s0, s3, 2
 ; GCN-DAG: s_mov_b32 s1, s2
 ; GCN: s_mov_b32 s2, s3
 ; GCN: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
-define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
-  %v = fadd float %3, 1.0
-  %s = add i32 %2, 2
-  %a0 = insertvalue {float, i32, float, i32, i32} undef, float %v, 0
-  %a1 = insertvalue {float, i32, float, i32, i32} %a0, i32 %s, 1
-  %a2 = insertvalue {float, i32, float, i32, i32} %a1, float %3, 2
-  %a3 = insertvalue {float, i32, float, i32, i32} %a2, i32 %1, 3
-  %a4 = insertvalue {float, i32, float, i32, i32} %a3, i32 %2, 4
-  ret {float, i32, float, i32, i32} %a4
+define amdgpu_vs { float, i32, float, i32, i32 } @both([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
+  %v = fadd float %arg3, 1.000000e+00
+  %s = add i32 %arg2, 2
+  %a0 = insertvalue { float, i32, float, i32, i32 } undef, float %v, 0
+  %a1 = insertvalue { float, i32, float, i32, i32 } %a0, i32 %s, 1
+  %a2 = insertvalue { float, i32, float, i32, i32 } %a1, float %arg3, 2
+  %a3 = insertvalue { float, i32, float, i32, i32 } %a2, i32 %arg1, 3
+  %a4 = insertvalue { float, i32, float, i32, i32 } %a3, i32 %arg2, 4
+  ret { float, i32, float, i32, i32 } %a4
 }

-
 ; GCN-LABEL: {{^}}structure_literal:
 ; GCN: v_mov_b32_e32 v3, v0
-; GCN: exp mrt0 v3, v3, v3, v3 done compr vm
+; GCN: exp mrt0 v3, v3, v3, v3 done vm

 ; GCN-DAG: v_mov_b32_e32 v0, 1.0
 ; GCN-DAG: s_mov_b32 s0, 2
@ -239,9 +235,16 @@ define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2
 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
 ; GCN-DAG: v_mov_b32_e32 v2, 4.0
 ; GCN: s_waitcnt expcnt(0)
-define amdgpu_vs {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
-  ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> <float 2.0, float 4.0>}}
+define amdgpu_vs { { float, i32 }, { i32, <2 x float> } } @structure_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
+  ret { { float, i32 }, { i32, <2 x float> } } { { float, i32 } { float 1.000000e+00, i32 2 }, { i32, <2 x float> } { i32 3, <2 x float> <float 2.000000e+00, float 4.000000e+00> } }
 }

-attributes #0 = { nounwind "InitialPSInputAddr"="0" }
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "InitialPSInputAddr"="0" }
+attributes #2 = { nounwind "InitialPSInputAddr"="1" }
+attributes #3 = { nounwind "InitialPSInputAddr"="119" }
+attributes #4 = { nounwind "InitialPSInputAddr"="418" }
--- a/test/CodeGen/AMDGPU/seto.ll
+++ b/test/CodeGen/AMDGPU/seto.ll
@ -4,12 +4,9 @@
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
 ; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]]
-define void @main(float %p) {
+define amdgpu_ps float @main(float inreg %p) {
 main_body:
  %c = fcmp oeq float %p, %p
  %r = select i1 %c, float 1.000000e+00, float 0.000000e+00
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r)
-  ret void
+  ret float %r
 }
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
--- a/test/CodeGen/AMDGPU/setuo.ll
+++ b/test/CodeGen/AMDGPU/setuo.ll
@ -4,12 +4,9 @@
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
 ; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]]
-define void @main(float %p) {
+define amdgpu_ps float @main(float inreg %p) {
 main_body:
  %c = fcmp une float %p, %p
  %r = select i1 %c, float 1.000000e+00, float 0.000000e+00
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r)
-  ret void
+  ret float %r
 }
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
--- a/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy.ll
@ -1,13 +1,10 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s

-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
 ; CHECK-LABEL: {{^}}phi1:
 ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
 ; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]]
-define amdgpu_ps void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 {
+define amdgpu_ps void @phi1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
  %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@ -25,13 +22,13 @@ ELSE:                                             ; preds = %main_body
 ENDIF:                                            ; preds = %ELSE, %main_body
  %temp.0 = phi float [ %tmp26, %ELSE ], [ %tmp21, %main_body ]
  %tmp27 = fadd float %temp.0, %tmp23
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp27, float %tmp27, float 0.000000e+00, float 1.000000e+00)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp27, float %tmp27, float 0.000000e+00, float 1.000000e+00, i1 true, i1 true) #0
  ret void
 }

 ; Make sure this program doesn't crash
 ; CHECK-LABEL: {{^}}phi2:
-define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 {
 main_body:
  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
  %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@ -58,32 +55,32 @@ main_body:
  %j.i = extractelement <2 x i32> %arg5, i32 1
  %i.f.i = bitcast i32 %i.i to float
  %j.f.i = bitcast i32 %j.i to float
-  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg3) #0
-  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg3) #0
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg3) #1
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg3) #1
  %i.i19 = extractelement <2 x i32> %arg5, i32 0
  %j.i20 = extractelement <2 x i32> %arg5, i32 1
  %i.f.i21 = bitcast i32 %i.i19 to float
  %j.f.i22 = bitcast i32 %j.i20 to float
-  %p1.i23 = call float @llvm.amdgcn.interp.p1(float %i.f.i21, i32 1, i32 0, i32 %arg3) #0
-  %p2.i24 = call float @llvm.amdgcn.interp.p2(float %p1.i23, float %j.f.i22, i32 1, i32 0, i32 %arg3) #0
+  %p1.i23 = call float @llvm.amdgcn.interp.p1(float %i.f.i21, i32 1, i32 0, i32 %arg3) #1
+  %p2.i24 = call float @llvm.amdgcn.interp.p2(float %p1.i23, float %j.f.i22, i32 1, i32 0, i32 %arg3) #1
  %i.i13 = extractelement <2 x i32> %arg5, i32 0
  %j.i14 = extractelement <2 x i32> %arg5, i32 1
  %i.f.i15 = bitcast i32 %i.i13 to float
  %j.f.i16 = bitcast i32 %j.i14 to float
-  %p1.i17 = call float @llvm.amdgcn.interp.p1(float %i.f.i15, i32 0, i32 1, i32 %arg3) #0
-  %p2.i18 = call float @llvm.amdgcn.interp.p2(float %p1.i17, float %j.f.i16, i32 0, i32 1, i32 %arg3) #0
+  %p1.i17 = call float @llvm.amdgcn.interp.p1(float %i.f.i15, i32 0, i32 1, i32 %arg3) #1
+  %p2.i18 = call float @llvm.amdgcn.interp.p2(float %p1.i17, float %j.f.i16, i32 0, i32 1, i32 %arg3) #1
  %i.i7 = extractelement <2 x i32> %arg5, i32 0
  %j.i8 = extractelement <2 x i32> %arg5, i32 1
  %i.f.i9 = bitcast i32 %i.i7 to float
  %j.f.i10 = bitcast i32 %j.i8 to float
-  %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 1, i32 %arg3) #0
-  %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 1, i32 %arg3) #0
+  %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 1, i32 %arg3) #1
+  %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 1, i32 %arg3) #1
  %i.i1 = extractelement <2 x i32> %arg5, i32 0
  %j.i2 = extractelement <2 x i32> %arg5, i32 1
  %i.f.i3 = bitcast i32 %i.i1 to float
  %j.f.i4 = bitcast i32 %j.i2 to float
-  %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 1, i32 %arg3) #0
-  %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 1, i32 %arg3) #0
+  %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 1, i32 %arg3) #1
+  %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 1, i32 %arg3) #1
  %tmp45 = bitcast float %p2.i to i32
  %tmp46 = bitcast float %p2.i24 to i32
  %tmp47 = insertelement <2 x i32> undef, i32 %tmp45, i32 0
@ -168,16 +165,16 @@ ENDIF24:                                          ; preds = %IF25, %ENDIF
  %tmp111 = fsub float -0.000000e+00, %tmp105
  %tmp112 = fmul float %tmp111, %tmp106
  %tmp113 = call i32 @llvm.SI.packf16(float %tmp108, float %tmp110)
-  %tmp114 = bitcast i32 %tmp113 to float
+  %tmp114 = bitcast i32 %tmp113 to <2 x half>
  %tmp115 = call i32 @llvm.SI.packf16(float %tmp112, float 1.000000e+00)
-  %tmp116 = bitcast i32 %tmp115 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp114, float %tmp116, float %tmp114, float %tmp116)
+  %tmp116 = bitcast i32 %tmp115 to <2 x half>
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp114, <2 x half> %tmp116, i1 true, i1 true) #0
  ret void
 }

 ; We just want to make sure the program doesn't crash
 ; CHECK-LABEL: {{^}}loop:
-define amdgpu_ps void @loop(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 {
+define amdgpu_ps void @loop(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
  %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@ -204,7 +201,7 @@ LOOP:                                             ; preds = %ENDIF, %main_body
  br i1 %tmp33, label %IF, label %ENDIF

 IF:                                               ; preds = %LOOP
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00, i1 true, i1 true) #0
  ret void

 ENDIF:                                            ; preds = %LOOP
@ -230,7 +227,7 @@ ENDIF:                                            ; preds = %LOOP
 ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[SAMPLE_LO]]:[[SAMPLE_HI]]{{\]}}
 ; CHECK: exp
 ; CHECK: s_endpgm
-define amdgpu_ps void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #1 {
+define amdgpu_ps void @sample_v3([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
 entry:
  %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
  %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@ -261,7 +258,7 @@ endif:                                            ; preds = %else, %if
  %val.0 = phi float [ %val.if.0, %if ], [ %val.else.0, %else ]
  %val.1 = phi float [ %val.if.1, %if ], [ %val.else.1, %else ]
  %val.2 = phi float [ %val.if.2, %if ], [ %val.else.2, %else ]
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.000000e+00)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %val.0, float %val.1, float %val.2, float 0.000000e+00, i1 true, i1 true) #0
  ret void
 }

@ -294,7 +291,7 @@ endif:                                            ; preds = %if1, %if0, %entry
 ; This test is just checking that we don't crash / assertion fail.
 ; CHECK-LABEL: {{^}}copy2:
 ; CHECK: s_endpgm
-define amdgpu_ps void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #1 {
+define amdgpu_ps void @copy2([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
 entry:
  br label %LOOP68

@ -308,7 +305,7 @@ LOOP68:                                           ; preds = %ENDIF69, %entry
 IF70:                                             ; preds = %LOOP68
  %q = icmp ne i32 %l, 13
  %temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, i1 true, i1 true) #0
  ret void

 ENDIF69:                                          ; preds = %LOOP68
@ -330,7 +327,7 @@ ENDIF69:                                          ; preds = %LOOP68
 ; [[END]]:
 ; CHECK: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+}}:[[ADD]]{{\]}}
 ; CHECK: s_endpgm
-define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #1 {
+define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
 bb:
  %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0
  %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !3
@ -343,14 +340,14 @@ bb:
  %j.i = extractelement <2 x i32> %arg7, i32 1
  %i.f.i = bitcast i32 %i.i to float
  %j.f.i = bitcast i32 %j.i to float
-  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg5) #1
-  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg5) #1
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg5) #0
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg5) #0
  %i.i1 = extractelement <2 x i32> %arg7, i32 0
  %j.i2 = extractelement <2 x i32> %arg7, i32 1
  %i.f.i3 = bitcast i32 %i.i1 to float
  %j.f.i4 = bitcast i32 %j.i2 to float
-  %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg5) #1
-  %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg5) #1
+  %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg5) #0
+  %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg5) #0
  %tmp31 = bitcast float %tmp23 to i32
  %tmp36 = icmp ne i32 %tmp31, 0
  br i1 %tmp36, label %bb38, label %bb80
@ -377,80 +374,58 @@ bb80:                                             ; preds = %bb
 bb71:                                             ; preds = %bb80, %bb38
  %tmp72 = phi <4 x float> [ %tmp58, %bb38 ], [ %tmp87, %bb80 ]
  %tmp88 = extractelement <4 x float> %tmp72, i32 0
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp88, float %tmp88, float %tmp88, float %tmp88, i1 true, i1 true) #0
  ret void
 }

 ; Check that the resource descriptor is stored in an sgpr.
 ; CHECK-LABEL: {{^}}mimg_srsrc_sgpr:
 ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
-define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #1 {
+define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
 bb:
-  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
  %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
  %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tmp10 = extractelement <4 x float> %tmp9, i32 0
  %tmp12 = call i32 @llvm.SI.packf16(float undef, float %tmp10)
-  %tmp13 = bitcast i32 %tmp12 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef)
+  %tmp13 = bitcast i32 %tmp12 to <2 x half>
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp13, <2 x half> undef, i1 true, i1 true) #0
  ret void
 }

 ; Check that the sampler is stored in an sgpr.
 ; CHECK-LABEL: {{^}}mimg_ssamp_sgpr:
 ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
-define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #1 {
+define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 {
 bb:
-  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
  %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
  %tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0
  %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tmp10 = extractelement <4 x float> %tmp9, i32 0
  %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
-  %tmp13 = bitcast i32 %tmp12 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef)
+  %tmp13 = bitcast i32 %tmp12 to <2 x half>
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp13, <2 x half> undef, i1 true, i1 true) #0
  ret void
 }

-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #0
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.amdgcn.rsq.f32(float) #1
+declare float @llvm.exp2.f32(float) #1
+declare float @llvm.pow.f32(float, float) #1
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0

-; Function Attrs: nounwind readnone
-declare float @llvm.fabs.f32(float) #0
+declare i32 @llvm.SI.packf16(float, float) #1
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1

-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <8 x i32>, <16 x i8>, i32) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.rsq.f32(float) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.exp2.f32(float) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.pow.f32(float, float) #0
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #0
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0
-
-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind }
-attributes #2 = { nounwind readonly }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

 !0 = !{!1, !1, i64 0, i32 1}
 !1 = !{!"const", !2}
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@ -1,6 +1,6 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; XUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s

 declare i32 @llvm.r600.read.tidig.x() #0

@ -466,4 +466,12 @@ define void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 a
  ret void
 }

+; A multiply by 2 should be selected as a scalar left shift by 1.
+; FUNC-LABEL: {{^}}test_mul2:
+; GCN: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define void @test_mul2(i32 %p) {
+  %i = mul i32 %p, 2
+  ; volatile store keeps the result live so the shift is not eliminated
+  store volatile i32 %i, i32 addrspace(1)* undef
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/si-literal-folding.ll
+++ b/test/CodeGen/AMDGPU/si-literal-folding.ll
@ -1,14 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}main:
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f4353f8, v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0xbf4353f8, v{{[0-9]+}}
-define amdgpu_vs void @main(float) {
-main_body:
-  %1 = fmul float %0, 0x3FE86A7F00000000
-  %2 = fmul float %0, 0xBFE86A7F00000000
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %1, float %1, float %2, float %2)
-  ret void
-}
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
--- a/test/CodeGen/AMDGPU/si-lod-bias.ll
+++ b/test/CodeGen/AMDGPU/si-lod-bias.ll
@ -1,11 +1,11 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

 ; This shader has the potential to generate illegal VGPR to SGPR copies if
 ; the wrong register class is used for the REG_SEQUENCE instructions.

-; CHECK: {{^}}main:
-; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf
+; GCN-LABEL: {{^}}main:
+; GCN: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf
 define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
@ -40,26 +40,16 @@ main_body:
  %tmp37 = extractelement <4 x float> %tmp35, i32 1
  %tmp38 = extractelement <4 x float> %tmp35, i32 2
  %tmp39 = extractelement <4 x float> %tmp35, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp36, float %tmp37, float %tmp38, float %tmp39)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp36, float %tmp37, float %tmp38, float %tmp39, i1 true, i1 true) #0
  ret void
 }

-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-; Function Attrs: nounwind readnone
 declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
-
-; Function Attrs: nounwind readnone
 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0

-; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/si-scheduler.ll
+++ b/test/CodeGen/AMDGPU/si-scheduler.ll
@ -3,7 +3,7 @@
 ; The only way the subtarget knows that the si machine scheduler is being used
 ; is to specify -mattr=si-scheduler.  If we just pass --misched=si, the backend
 ; won't know what scheduler we are using.
-; RUN: llc -march=amdgcn -mcpu=SI --misched=si -mattr=si-scheduler < %s | FileCheck %s
+; RUN: llc -march=amdgcn --misched=si -mattr=si-scheduler < %s | FileCheck %s

 ; The test checks the "si" machine scheduler pass works correctly.

@ -16,7 +16,7 @@
 ; CHECK: s_waitcnt vmcnt(0)
 ; CHECK: exp
 ; CHECK: s_endpgm
-define amdgpu_ps void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) {
+define amdgpu_ps void @main([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
 main_body:
  %tmp = bitcast [34 x <8 x i32>] addrspace(2)* %arg3 to <32 x i8> addrspace(2)*
  %tmp22 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp, align 32, !tbaa !0
@ -46,29 +46,22 @@ main_body:
  %tmp34 = extractelement <4 x float> %tmp31, i32 2
  %tmp35 = extractelement <4 x float> %tmp31, i32 3
  %tmp36 = call i32 @llvm.SI.packf16(float %tmp32, float %tmp33)
-  %tmp37 = bitcast i32 %tmp36 to float
+  %tmp37 = bitcast i32 %tmp36 to <2 x half>
  %tmp38 = call i32 @llvm.SI.packf16(float %tmp34, float %tmp35)
-  %tmp39 = bitcast i32 %tmp38 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp37, float %tmp39, float %tmp37, float %tmp39)
+  %tmp39 = bitcast i32 %tmp38 to <2 x half>
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp37, <2 x half> %tmp39, i1 true, i1 false) #0
  ret void
 }

-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0

-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #0
+declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare i32 @llvm.SI.packf16(float, float) #1

-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
-
-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }

 !0 = !{!1, !1, i64 0, i32 1}
 !1 = !{!"const", !2}
--- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@ -732,10 +732,10 @@ IF67:                                             ; preds = %LOOP65
  %tmp579 = fmul float %tmp574, %tmp45
  %tmp580 = fadd float %tmp579, %tmp556
  %tmp581 = call i32 @llvm.SI.packf16(float %tmp576, float %tmp578)
-  %tmp582 = bitcast i32 %tmp581 to float
+  %tmp582 = bitcast i32 %tmp581 to <2 x half>
  %tmp583 = call i32 @llvm.SI.packf16(float %tmp580, float %tmp282)
-  %tmp584 = bitcast i32 %tmp583 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp582, float %tmp584, float %tmp582, float %tmp584)
+  %tmp584 = bitcast i32 %tmp583 to <2 x half>
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp582, <2 x half> %tmp584, i1 true, i1 true) #0
  ret void

 ENDIF66:                                          ; preds = %LOOP65
@ -1814,10 +1814,10 @@ ENDIF209:                                         ; preds = %ELSE214, %ELSE211,
  %max.0.i1 = call float @llvm.maxnum.f32(float %tmp774, float 0.000000e+00)
  %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
  %tmp776 = call i32 @llvm.SI.packf16(float %tmp768, float %tmp770)
-  %tmp777 = bitcast i32 %tmp776 to float
+  %tmp777 = bitcast i32 %tmp776 to <2 x half>
  %tmp778 = call i32 @llvm.SI.packf16(float %tmp772, float %clamp.i2)
-  %tmp779 = bitcast i32 %tmp778 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp777, float %tmp779, float %tmp777, float %tmp779)
+  %tmp779 = bitcast i32 %tmp778 to <2 x half>
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp777, <2 x half> %tmp779, i1 true, i1 true) #0
  ret void

 ELSE214:                                          ; preds = %ELSE211
@ -1835,11 +1835,11 @@ ELSE214:                                          ; preds = %ELSE211

 declare float @llvm.exp2.f32(float) #1
 declare float @llvm.ceil.f32(float) #1
-declare float @llvm.amdgcn.rsq.f32(float) #1
 declare float @llvm.fabs.f32(float) #1
 declare float @llvm.pow.f32(float, float) #1
 declare float @llvm.minnum.f32(float, float) #1
 declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.amdgcn.rsq.f32(float) #1
 declare float @llvm.amdgcn.cubeid(float, float, float) #1
 declare float @llvm.amdgcn.cubesc(float, float, float) #1
 declare float @llvm.amdgcn.cubetc(float, float, float) #1
@ -1848,13 +1848,14 @@ declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
 declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+
 declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
 declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
 declare <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
 declare float @llvm.SI.load.const(<16 x i8>, i32) #1
 declare i32 @llvm.SI.packf16(float, float) #1
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/si-spill-cf.ll
+++ b/test/CodeGen/AMDGPU/si-spill-cf.ll
@ -6,270 +6,271 @@

 ; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]]
 ; SI-NOT: v_readlane_b32 [[SAVED]]
+
 define amdgpu_ps void @main() #0 {
 main_body:
-  %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16)
-  %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32)
-  %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80)
-  %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84)
-  %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88)
-  %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
-  %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100)
-  %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104)
-  %8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112)
-  %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116)
-  %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120)
-  %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128)
-  %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132)
-  %13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136)
-  %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144)
-  %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148)
-  %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152)
-  %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160)
-  %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164)
-  %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168)
-  %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176)
-  %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180)
-  %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184)
-  %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192)
-  %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196)
-  %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200)
-  %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208)
-  %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212)
-  %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216)
-  %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224)
-  %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228)
-  %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232)
-  %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240)
-  %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244)
-  %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248)
-  %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256)
-  %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260)
-  %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264)
-  %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272)
-  %39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276)
-  %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280)
-  %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288)
-  %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292)
-  %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296)
-  %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304)
-  %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308)
-  %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312)
-  %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320)
-  %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324)
-  %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328)
-  %50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336)
-  %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340)
-  %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344)
-  %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352)
-  %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356)
-  %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360)
-  %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368)
-  %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372)
-  %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376)
-  %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384)
-  %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388)
-  %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392)
-  %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400)
-  %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404)
-  %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408)
-  %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416)
-  %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420)
+  %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 16)
+  %tmp1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32)
+  %tmp2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80)
+  %tmp3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84)
+  %tmp4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88)
+  %tmp5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
+  %tmp6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100)
+  %tmp7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104)
+  %tmp8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112)
+  %tmp9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116)
+  %tmp10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120)
+  %tmp11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128)
+  %tmp12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132)
+  %tmp13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136)
+  %tmp14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144)
+  %tmp15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148)
+  %tmp16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152)
+  %tmp17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160)
+  %tmp18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164)
+  %tmp19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168)
+  %tmp20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176)
+  %tmp21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180)
+  %tmp22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184)
+  %tmp23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192)
+  %tmp24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196)
+  %tmp25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200)
+  %tmp26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208)
+  %tmp27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212)
+  %tmp28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216)
+  %tmp29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224)
+  %tmp30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228)
+  %tmp31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232)
+  %tmp32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240)
+  %tmp33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244)
+  %tmp34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248)
+  %tmp35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256)
+  %tmp36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260)
+  %tmp37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264)
+  %tmp38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272)
+  %tmp39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276)
+  %tmp40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280)
+  %tmp41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288)
+  %tmp42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292)
+  %tmp43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296)
+  %tmp44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304)
+  %tmp45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308)
+  %tmp46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312)
+  %tmp47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320)
+  %tmp48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324)
+  %tmp49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328)
+  %tmp50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336)
+  %tmp51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340)
+  %tmp52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344)
+  %tmp53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352)
+  %tmp54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356)
+  %tmp55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360)
+  %tmp56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368)
+  %tmp57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372)
+  %tmp58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376)
+  %tmp59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384)
+  %tmp60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388)
+  %tmp61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392)
+  %tmp62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400)
+  %tmp63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404)
+  %tmp64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408)
+  %tmp65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416)
+  %tmp66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420)
  br label %LOOP

 LOOP:                                             ; preds = %ENDIF2795, %main_body
  %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ]
  %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ]
  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-  %67 = icmp sgt i32 %tid, 4
-  br i1 %67, label %ENDLOOP, label %ENDIF
+  %tmp67 = icmp sgt i32 %tid, 4
+  br i1 %tmp67, label %ENDLOOP, label %ENDIF

 ENDLOOP:                                          ; preds = %ELSE2566, %LOOP
-  %one.sub.a.i = fsub float 1.000000e+00, %0
+  %one.sub.a.i = fsub float 1.000000e+00, %tmp
  %one.sub.ac.i = fmul float %one.sub.a.i, undef
  %result.i = fadd float fmul (float undef, float undef), %one.sub.ac.i
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %result.i, float undef, float 1.000000e+00)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float undef, float %result.i, float undef, float 1.000000e+00, i1 true, i1 true) #0
  ret void

 ENDIF:                                            ; preds = %LOOP
-  %68 = fsub float %2, undef
-  %69 = fsub float %3, undef
-  %70 = fsub float %4, undef
-  %71 = fmul float %68, 0.000000e+00
-  %72 = fmul float %69, undef
-  %73 = fmul float %70, undef
-  %74 = fsub float %6, undef
-  %75 = fsub float %7, undef
-  %76 = fmul float %74, undef
-  %77 = fmul float %75, 0.000000e+00
-  %78 = call float @llvm.minnum.f32(float %73, float %77)
-  %79 = call float @llvm.maxnum.f32(float %71, float 0.000000e+00)
-  %80 = call float @llvm.maxnum.f32(float %72, float %76)
-  %81 = call float @llvm.maxnum.f32(float undef, float %78)
-  %82 = call float @llvm.minnum.f32(float %79, float %80)
-  %83 = call float @llvm.minnum.f32(float %82, float undef)
-  %84 = fsub float %14, undef
-  %85 = fsub float %15, undef
-  %86 = fsub float %16, undef
-  %87 = fmul float %84, undef
-  %88 = fmul float %85, undef
-  %89 = fmul float %86, undef
-  %90 = fsub float %17, undef
-  %91 = fsub float %18, undef
-  %92 = fsub float %19, undef
-  %93 = fmul float %90, 0.000000e+00
-  %94 = fmul float %91, undef
-  %95 = fmul float %92, undef
-  %96 = call float @llvm.minnum.f32(float %88, float %94)
-  %97 = call float @llvm.maxnum.f32(float %87, float %93)
-  %98 = call float @llvm.maxnum.f32(float %89, float %95)
-  %99 = call float @llvm.maxnum.f32(float undef, float %96)
-  %100 = call float @llvm.maxnum.f32(float %99, float undef)
-  %101 = call float @llvm.minnum.f32(float %97, float undef)
-  %102 = call float @llvm.minnum.f32(float %101, float %98)
-  %103 = fsub float %30, undef
-  %104 = fsub float %31, undef
-  %105 = fmul float %103, 0.000000e+00
-  %106 = fmul float %104, 0.000000e+00
-  %107 = call float @llvm.minnum.f32(float undef, float %105)
-  %108 = call float @llvm.maxnum.f32(float undef, float %106)
-  %109 = call float @llvm.maxnum.f32(float undef, float %107)
-  %110 = call float @llvm.maxnum.f32(float %109, float undef)
-  %111 = call float @llvm.minnum.f32(float undef, float %108)
-  %112 = fsub float %32, undef
-  %113 = fsub float %33, undef
-  %114 = fsub float %34, undef
-  %115 = fmul float %112, 0.000000e+00
-  %116 = fmul float %113, undef
-  %117 = fmul float %114, undef
-  %118 = fsub float %35, undef
-  %119 = fsub float %36, undef
-  %120 = fsub float %37, undef
-  %121 = fmul float %118, undef
-  %122 = fmul float %119, undef
-  %123 = fmul float %120, undef
-  %124 = call float @llvm.minnum.f32(float %115, float %121)
-  %125 = call float @llvm.minnum.f32(float %116, float %122)
-  %126 = call float @llvm.minnum.f32(float %117, float %123)
-  %127 = call float @llvm.maxnum.f32(float %124, float %125)
-  %128 = call float @llvm.maxnum.f32(float %127, float %126)
-  %129 = fsub float %38, undef
-  %130 = fsub float %39, undef
-  %131 = fsub float %40, undef
-  %132 = fmul float %129, 0.000000e+00
-  %133 = fmul float %130, undef
-  %134 = fmul float %131, undef
-  %135 = fsub float %41, undef
-  %136 = fsub float %42, undef
-  %137 = fsub float %43, undef
-  %138 = fmul float %135, undef
-  %139 = fmul float %136, undef
-  %140 = fmul float %137, undef
-  %141 = call float @llvm.minnum.f32(float %132, float %138)
-  %142 = call float @llvm.minnum.f32(float %133, float %139)
-  %143 = call float @llvm.minnum.f32(float %134, float %140)
-  %144 = call float @llvm.maxnum.f32(float %141, float %142)
-  %145 = call float @llvm.maxnum.f32(float %144, float %143)
-  %146 = fsub float %44, undef
-  %147 = fsub float %45, undef
-  %148 = fsub float %46, undef
-  %149 = fmul float %146, 0.000000e+00
-  %150 = fmul float %147, 0.000000e+00
-  %151 = fmul float %148, undef
-  %152 = fsub float %47, undef
-  %153 = fsub float %48, undef
-  %154 = fsub float %49, undef
-  %155 = fmul float %152, undef
-  %156 = fmul float %153, 0.000000e+00
-  %157 = fmul float %154, undef
-  %158 = call float @llvm.minnum.f32(float %149, float %155)
-  %159 = call float @llvm.minnum.f32(float %150, float %156)
-  %160 = call float @llvm.minnum.f32(float %151, float %157)
-  %161 = call float @llvm.maxnum.f32(float %158, float %159)
-  %162 = call float @llvm.maxnum.f32(float %161, float %160)
-  %163 = fsub float %50, undef
-  %164 = fsub float %51, undef
-  %165 = fsub float %52, undef
-  %166 = fmul float %163, undef
-  %167 = fmul float %164, 0.000000e+00
-  %168 = fmul float %165, 0.000000e+00
-  %169 = fsub float %53, undef
-  %170 = fsub float %54, undef
-  %171 = fsub float %55, undef
-  %172 = fdiv float 1.000000e+00, %temp18.0
-  %173 = fmul float %169, undef
-  %174 = fmul float %170, undef
-  %175 = fmul float %171, %172
-  %176 = call float @llvm.minnum.f32(float %166, float %173)
-  %177 = call float @llvm.minnum.f32(float %167, float %174)
-  %178 = call float @llvm.minnum.f32(float %168, float %175)
-  %179 = call float @llvm.maxnum.f32(float %176, float %177)
-  %180 = call float @llvm.maxnum.f32(float %179, float %178)
-  %181 = fsub float %62, undef
-  %182 = fsub float %63, undef
-  %183 = fsub float %64, undef
-  %184 = fmul float %181, 0.000000e+00
-  %185 = fmul float %182, undef
-  %186 = fmul float %183, undef
-  %187 = fsub float %65, undef
-  %188 = fsub float %66, undef
-  %189 = fmul float %187, undef
-  %190 = fmul float %188, undef
-  %191 = call float @llvm.maxnum.f32(float %184, float %189)
-  %192 = call float @llvm.maxnum.f32(float %185, float %190)
-  %193 = call float @llvm.maxnum.f32(float %186, float undef)
-  %194 = call float @llvm.minnum.f32(float %191, float %192)
-  %195 = call float @llvm.minnum.f32(float %194, float %193)
-  %.temp292.7 = select i1 undef, float %162, float undef
-  %temp292.9 = select i1 false, float %180, float %.temp292.7
+  %tmp68 = fsub float %tmp2, undef
+  %tmp69 = fsub float %tmp3, undef
+  %tmp70 = fsub float %tmp4, undef
+  %tmp71 = fmul float %tmp68, 0.000000e+00
+  %tmp72 = fmul float %tmp69, undef
+  %tmp73 = fmul float %tmp70, undef
+  %tmp74 = fsub float %tmp6, undef
+  %tmp75 = fsub float %tmp7, undef
+  %tmp76 = fmul float %tmp74, undef
+  %tmp77 = fmul float %tmp75, 0.000000e+00
+  %tmp78 = call float @llvm.minnum.f32(float %tmp73, float %tmp77)
+  %tmp79 = call float @llvm.maxnum.f32(float %tmp71, float 0.000000e+00)
+  %tmp80 = call float @llvm.maxnum.f32(float %tmp72, float %tmp76)
+  %tmp81 = call float @llvm.maxnum.f32(float undef, float %tmp78)
+  %tmp82 = call float @llvm.minnum.f32(float %tmp79, float %tmp80)
+  %tmp83 = call float @llvm.minnum.f32(float %tmp82, float undef)
+  %tmp84 = fsub float %tmp14, undef
+  %tmp85 = fsub float %tmp15, undef
+  %tmp86 = fsub float %tmp16, undef
+  %tmp87 = fmul float %tmp84, undef
+  %tmp88 = fmul float %tmp85, undef
+  %tmp89 = fmul float %tmp86, undef
+  %tmp90 = fsub float %tmp17, undef
+  %tmp91 = fsub float %tmp18, undef
+  %tmp92 = fsub float %tmp19, undef
+  %tmp93 = fmul float %tmp90, 0.000000e+00
+  %tmp94 = fmul float %tmp91, undef
+  %tmp95 = fmul float %tmp92, undef
+  %tmp96 = call float @llvm.minnum.f32(float %tmp88, float %tmp94)
+  %tmp97 = call float @llvm.maxnum.f32(float %tmp87, float %tmp93)
+  %tmp98 = call float @llvm.maxnum.f32(float %tmp89, float %tmp95)
+  %tmp99 = call float @llvm.maxnum.f32(float undef, float %tmp96)
+  %tmp100 = call float @llvm.maxnum.f32(float %tmp99, float undef)
+  %tmp101 = call float @llvm.minnum.f32(float %tmp97, float undef)
+  %tmp102 = call float @llvm.minnum.f32(float %tmp101, float %tmp98)
+  %tmp103 = fsub float %tmp30, undef
+  %tmp104 = fsub float %tmp31, undef
+  %tmp105 = fmul float %tmp103, 0.000000e+00
+  %tmp106 = fmul float %tmp104, 0.000000e+00
+  %tmp107 = call float @llvm.minnum.f32(float undef, float %tmp105)
+  %tmp108 = call float @llvm.maxnum.f32(float undef, float %tmp106)
+  %tmp109 = call float @llvm.maxnum.f32(float undef, float %tmp107)
+  %tmp110 = call float @llvm.maxnum.f32(float %tmp109, float undef)
+  %tmp111 = call float @llvm.minnum.f32(float undef, float %tmp108)
+  %tmp112 = fsub float %tmp32, undef
+  %tmp113 = fsub float %tmp33, undef
+  %tmp114 = fsub float %tmp34, undef
+  %tmp115 = fmul float %tmp112, 0.000000e+00
+  %tmp116 = fmul float %tmp113, undef
+  %tmp117 = fmul float %tmp114, undef
+  %tmp118 = fsub float %tmp35, undef
+  %tmp119 = fsub float %tmp36, undef
+  %tmp120 = fsub float %tmp37, undef
+  %tmp121 = fmul float %tmp118, undef
+  %tmp122 = fmul float %tmp119, undef
+  %tmp123 = fmul float %tmp120, undef
+  %tmp124 = call float @llvm.minnum.f32(float %tmp115, float %tmp121)
+  %tmp125 = call float @llvm.minnum.f32(float %tmp116, float %tmp122)
+  %tmp126 = call float @llvm.minnum.f32(float %tmp117, float %tmp123)
+  %tmp127 = call float @llvm.maxnum.f32(float %tmp124, float %tmp125)
+  %tmp128 = call float @llvm.maxnum.f32(float %tmp127, float %tmp126)
+  %tmp129 = fsub float %tmp38, undef
+  %tmp130 = fsub float %tmp39, undef
+  %tmp131 = fsub float %tmp40, undef
+  %tmp132 = fmul float %tmp129, 0.000000e+00
+  %tmp133 = fmul float %tmp130, undef
+  %tmp134 = fmul float %tmp131, undef
+  %tmp135 = fsub float %tmp41, undef
+  %tmp136 = fsub float %tmp42, undef
+  %tmp137 = fsub float %tmp43, undef
+  %tmp138 = fmul float %tmp135, undef
+  %tmp139 = fmul float %tmp136, undef
+  %tmp140 = fmul float %tmp137, undef
+  %tmp141 = call float @llvm.minnum.f32(float %tmp132, float %tmp138)
+  %tmp142 = call float @llvm.minnum.f32(float %tmp133, float %tmp139)
+  %tmp143 = call float @llvm.minnum.f32(float %tmp134, float %tmp140)
+  %tmp144 = call float @llvm.maxnum.f32(float %tmp141, float %tmp142)
+  %tmp145 = call float @llvm.maxnum.f32(float %tmp144, float %tmp143)
+  %tmp146 = fsub float %tmp44, undef
+  %tmp147 = fsub float %tmp45, undef
+  %tmp148 = fsub float %tmp46, undef
+  %tmp149 = fmul float %tmp146, 0.000000e+00
+  %tmp150 = fmul float %tmp147, 0.000000e+00
+  %tmp151 = fmul float %tmp148, undef
+  %tmp152 = fsub float %tmp47, undef
+  %tmp153 = fsub float %tmp48, undef
+  %tmp154 = fsub float %tmp49, undef
+  %tmp155 = fmul float %tmp152, undef
+  %tmp156 = fmul float %tmp153, 0.000000e+00
+  %tmp157 = fmul float %tmp154, undef
+  %tmp158 = call float @llvm.minnum.f32(float %tmp149, float %tmp155)
+  %tmp159 = call float @llvm.minnum.f32(float %tmp150, float %tmp156)
+  %tmp160 = call float @llvm.minnum.f32(float %tmp151, float %tmp157)
+  %tmp161 = call float @llvm.maxnum.f32(float %tmp158, float %tmp159)
+  %tmp162 = call float @llvm.maxnum.f32(float %tmp161, float %tmp160)
+  %tmp163 = fsub float %tmp50, undef
+  %tmp164 = fsub float %tmp51, undef
+  %tmp165 = fsub float %tmp52, undef
+  %tmp166 = fmul float %tmp163, undef
+  %tmp167 = fmul float %tmp164, 0.000000e+00
+  %tmp168 = fmul float %tmp165, 0.000000e+00
+  %tmp169 = fsub float %tmp53, undef
+  %tmp170 = fsub float %tmp54, undef
+  %tmp171 = fsub float %tmp55, undef
+  %tmp172 = fdiv float 1.000000e+00, %temp18.0
+  %tmp173 = fmul float %tmp169, undef
+  %tmp174 = fmul float %tmp170, undef
+  %tmp175 = fmul float %tmp171, %tmp172
+  %tmp176 = call float @llvm.minnum.f32(float %tmp166, float %tmp173)
+  %tmp177 = call float @llvm.minnum.f32(float %tmp167, float %tmp174)
+  %tmp178 = call float @llvm.minnum.f32(float %tmp168, float %tmp175)
+  %tmp179 = call float @llvm.maxnum.f32(float %tmp176, float %tmp177)
+  %tmp180 = call float @llvm.maxnum.f32(float %tmp179, float %tmp178)
+  %tmp181 = fsub float %tmp62, undef
+  %tmp182 = fsub float %tmp63, undef
+  %tmp183 = fsub float %tmp64, undef
+  %tmp184 = fmul float %tmp181, 0.000000e+00
+  %tmp185 = fmul float %tmp182, undef
+  %tmp186 = fmul float %tmp183, undef
+  %tmp187 = fsub float %tmp65, undef
+  %tmp188 = fsub float %tmp66, undef
+  %tmp189 = fmul float %tmp187, undef
+  %tmp190 = fmul float %tmp188, undef
+  %tmp191 = call float @llvm.maxnum.f32(float %tmp184, float %tmp189)
+  %tmp192 = call float @llvm.maxnum.f32(float %tmp185, float %tmp190)
+  %tmp193 = call float @llvm.maxnum.f32(float %tmp186, float undef)
+  %tmp194 = call float @llvm.minnum.f32(float %tmp191, float %tmp192)
+  %tmp195 = call float @llvm.minnum.f32(float %tmp194, float %tmp193)
+  %.temp292.7 = select i1 undef, float %tmp162, float undef
+  %temp292.9 = select i1 false, float %tmp180, float %.temp292.7
  %.temp292.9 = select i1 undef, float undef, float %temp292.9
-  %196 = fcmp ogt float undef, 0.000000e+00
-  %197 = fcmp olt float undef, %195
-  %198 = and i1 %196, %197
-  %199 = fcmp olt float undef, %.temp292.9
-  %200 = and i1 %198, %199
-  %temp292.11 = select i1 %200, float undef, float %.temp292.9
-  %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+  %tmp196 = fcmp ogt float undef, 0.000000e+00
+  %tmp197 = fcmp olt float undef, %tmp195
+  %tmp198 = and i1 %tmp196, %tmp197
+  %tmp199 = fcmp olt float undef, %.temp292.9
+  %tmp200 = and i1 %tmp198, %tmp199
+  %temp292.11 = select i1 %tmp200, float undef, float %.temp292.9
+  %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %cmp0 = icmp eq i32 %tid0, 0
  br i1 %cmp0, label %IF2565, label %ELSE2566

 IF2565:                                           ; preds = %ENDIF
-  %tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+  %tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %cmp1 = icmp eq i32 %tid1, 0
  br i1 %cmp1, label %ENDIF2582, label %ELSE2584

 ELSE2566:                                         ; preds = %ENDIF
-  %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+  %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %tidf = bitcast i32 %tid2 to float
-  %201 = fcmp oeq float %temp292.11, %tidf
-  br i1 %201, label %ENDLOOP, label %ELSE2593
+  %tmp201 = fcmp oeq float %temp292.11, %tidf
+  br i1 %tmp201, label %ENDLOOP, label %ELSE2593

 ENDIF2564:                                        ; preds = %ENDIF2594, %ENDIF2588
  %temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ]
-  %temp18.1 = phi float [ %218, %ENDIF2588 ], [ undef, %ENDIF2594 ]
-  %202 = fsub float %5, undef
-  %203 = fmul float %202, undef
-  %204 = call float @llvm.maxnum.f32(float undef, float %203)
-  %205 = call float @llvm.minnum.f32(float %204, float undef)
-  %206 = call float @llvm.minnum.f32(float %205, float undef)
-  %207 = fcmp ogt float undef, 0.000000e+00
-  %208 = fcmp olt float undef, 1.000000e+00
-  %209 = and i1 %207, %208
-  %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+  %temp18.1 = phi float [ %tmp218, %ENDIF2588 ], [ undef, %ENDIF2594 ]
+  %tmp202 = fsub float %tmp5, undef
+  %tmp203 = fmul float %tmp202, undef
+  %tmp204 = call float @llvm.maxnum.f32(float undef, float %tmp203)
+  %tmp205 = call float @llvm.minnum.f32(float %tmp204, float undef)
+  %tmp206 = call float @llvm.minnum.f32(float %tmp205, float undef)
+  %tmp207 = fcmp ogt float undef, 0.000000e+00
+  %tmp208 = fcmp olt float undef, 1.000000e+00
+  %tmp209 = and i1 %tmp207, %tmp208
+  %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %tidf3 = bitcast i32 %tid3 to float
-  %210 = fcmp olt float %tidf3, %206
-  %211 = and i1 %209, %210
-  br i1 %211, label %ENDIF2795, label %ELSE2797
+  %tmp210 = fcmp olt float %tidf3, %tmp206
+  %tmp211 = and i1 %tmp209, %tmp210
+  br i1 %tmp211, label %ENDIF2795, label %ELSE2797

 ELSE2584:                                         ; preds = %IF2565
  br label %ENDIF2582

 ENDIF2582:                                        ; preds = %ELSE2584, %IF2565
-  %212 = fadd float %1, undef
-  %213 = fadd float 0.000000e+00, %212
-  %floor = call float @llvm.floor.f32(float %213)
-  %214 = fsub float %213, %floor
-  %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+  %tmp212 = fadd float %tmp1, undef
+  %tmp213 = fadd float 0.000000e+00, %tmp212
+  %floor = call float @llvm.floor.f32(float %tmp213)
+  %tmp214 = fsub float %tmp213, %floor
+  %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
  %cmp4 = icmp eq i32 %tid4, 0
  br i1 %cmp4, label %IF2589, label %ELSE2590

@ -280,61 +281,61 @@ ELSE2590:                                         ; preds = %ENDIF2582
  br label %ENDIF2588

 ENDIF2588:                                        ; preds = %ELSE2590, %IF2589
-  %215 = fsub float 1.000000e+00, %214
-  %216 = call float @llvm.sqrt.f32(float %215)
-  %217 = fmul float %216, undef
-  %218 = fadd float %217, undef
+  %tmp215 = fsub float 1.000000e+00, %tmp214
+  %tmp216 = call float @llvm.sqrt.f32(float %tmp215)
+  %tmp217 = fmul float %tmp216, undef
+  %tmp218 = fadd float %tmp217, undef
  br label %ENDIF2564

 ELSE2593:                                         ; preds = %ELSE2566
-  %219 = fcmp oeq float %temp292.11, %81
-  %220 = fcmp olt float %81, %83
-  %221 = and i1 %219, %220
-  br i1 %221, label %ENDIF2594, label %ELSE2596
+  %tmp219 = fcmp oeq float %temp292.11, %tmp81
+  %tmp220 = fcmp olt float %tmp81, %tmp83
+  %tmp221 = and i1 %tmp219, %tmp220
+  br i1 %tmp221, label %ENDIF2594, label %ELSE2596

 ELSE2596:                                         ; preds = %ELSE2593
-  %222 = fcmp oeq float %temp292.11, %100
-  %223 = fcmp olt float %100, %102
-  %224 = and i1 %222, %223
-  br i1 %224, label %ENDIF2594, label %ELSE2632
+  %tmp222 = fcmp oeq float %temp292.11, %tmp100
+  %tmp223 = fcmp olt float %tmp100, %tmp102
+  %tmp224 = and i1 %tmp222, %tmp223
+  br i1 %tmp224, label %ENDIF2594, label %ELSE2632

 ENDIF2594:                                        ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593
  %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ]
-  %225 = fmul float %temp894.2, undef
+  %tmp225 = fmul float %temp894.2, undef
  br label %ENDIF2564

 ELSE2632:                                         ; preds = %ELSE2596
  br i1 undef, label %ENDIF2594, label %ELSE2650

 ELSE2650:                                         ; preds = %ELSE2632
-  %226 = fcmp oeq float %temp292.11, %110
-  %227 = fcmp olt float %110, %111
-  %228 = and i1 %226, %227
-  br i1 %228, label %IF2667, label %ELSE2668
+  %tmp226 = fcmp oeq float %temp292.11, %tmp110
+  %tmp227 = fcmp olt float %tmp110, %tmp111
+  %tmp228 = and i1 %tmp226, %tmp227
+  br i1 %tmp228, label %IF2667, label %ELSE2668

 IF2667:                                           ; preds = %ELSE2650
  br i1 undef, label %ENDIF2594, label %ELSE2671

 ELSE2668:                                         ; preds = %ELSE2650
-  %229 = fcmp oeq float %temp292.11, %128
-  %230 = fcmp olt float %128, undef
-  %231 = and i1 %229, %230
-  br i1 %231, label %ENDIF2594, label %ELSE2686
+  %tmp229 = fcmp oeq float %temp292.11, %tmp128
+  %tmp230 = fcmp olt float %tmp128, undef
+  %tmp231 = and i1 %tmp229, %tmp230
+  br i1 %tmp231, label %ENDIF2594, label %ELSE2686

 ELSE2671:                                         ; preds = %IF2667
  br label %ENDIF2594

 ELSE2686:                                         ; preds = %ELSE2668
-  %232 = fcmp oeq float %temp292.11, %145
-  %233 = fcmp olt float %145, undef
-  %234 = and i1 %232, %233
-  br i1 %234, label %ENDIF2594, label %ELSE2704
+  %tmp232 = fcmp oeq float %temp292.11, %tmp145
+  %tmp233 = fcmp olt float %tmp145, undef
+  %tmp234 = and i1 %tmp232, %tmp233
+  br i1 %tmp234, label %ENDIF2594, label %ELSE2704

 ELSE2704:                                         ; preds = %ELSE2686
-  %235 = fcmp oeq float %temp292.11, %180
-  %236 = fcmp olt float %180, undef
-  %237 = and i1 %235, %236
-  br i1 %237, label %ENDIF2594, label %ELSE2740
+  %tmp235 = fcmp oeq float %temp292.11, %tmp180
+  %tmp236 = fcmp olt float %tmp180, undef
+  %tmp237 = and i1 %tmp235, %tmp236
+  br i1 %tmp237, label %ENDIF2594, label %ELSE2740

 ELSE2740:                                         ; preds = %ELSE2704
  br i1 undef, label %IF2757, label %ELSE2758
@ -349,8 +350,8 @@ ELSE2761:                                         ; preds = %IF2757
  br label %ENDIF2594

 IF2775:                                           ; preds = %ELSE2758
-  %238 = fcmp olt float undef, undef
-  br i1 %238, label %ENDIF2594, label %ELSE2779
+  %tmp238 = fcmp olt float undef, undef
+  br i1 %tmp238, label %ENDIF2594, label %ELSE2779

 ELSE2779:                                         ; preds = %IF2775
  br i1 undef, label %ENDIF2594, label %ELSE2782
@ -359,39 +360,39 @@ ELSE2782:                                         ; preds = %ELSE2779
  br i1 undef, label %ENDIF2594, label %ELSE2785

 ELSE2785:                                         ; preds = %ELSE2782
-  %239 = fcmp olt float undef, 0.000000e+00
-  br i1 %239, label %ENDIF2594, label %ELSE2788
+  %tmp239 = fcmp olt float undef, 0.000000e+00
+  br i1 %tmp239, label %ENDIF2594, label %ELSE2788

 ELSE2788:                                         ; preds = %ELSE2785
-  %240 = fcmp olt float 0.000000e+00, undef
-  %.2848 = select i1 %240, float -1.000000e+00, float 1.000000e+00
+  %tmp240 = fcmp olt float 0.000000e+00, undef
+  %.2848 = select i1 %tmp240, float -1.000000e+00, float 1.000000e+00
  br label %ENDIF2594

 ELSE2797:                                         ; preds = %ENDIF2564
-  %241 = fsub float %8, undef
-  %242 = fsub float %9, undef
-  %243 = fsub float %10, undef
-  %244 = fmul float %241, undef
-  %245 = fmul float %242, undef
-  %246 = fmul float %243, undef
-  %247 = fsub float %11, undef
-  %248 = fsub float %12, undef
-  %249 = fsub float %13, undef
-  %250 = fmul float %247, undef
-  %251 = fmul float %248, undef
-  %252 = fmul float %249, undef
-  %253 = call float @llvm.minnum.f32(float %244, float %250)
-  %254 = call float @llvm.minnum.f32(float %245, float %251)
-  %255 = call float @llvm.maxnum.f32(float %246, float %252)
-  %256 = call float @llvm.maxnum.f32(float %253, float %254)
-  %257 = call float @llvm.maxnum.f32(float %256, float undef)
-  %258 = call float @llvm.minnum.f32(float undef, float %255)
-  %259 = fcmp ogt float %257, 0.000000e+00
-  %260 = fcmp olt float %257, 1.000000e+00
-  %261 = and i1 %259, %260
-  %262 = fcmp olt float %257, %258
-  %263 = and i1 %261, %262
-  br i1 %263, label %ENDIF2795, label %ELSE2800
+  %tmp241 = fsub float %tmp8, undef
+  %tmp242 = fsub float %tmp9, undef
+  %tmp243 = fsub float %tmp10, undef
+  %tmp244 = fmul float %tmp241, undef
+  %tmp245 = fmul float %tmp242, undef
+  %tmp246 = fmul float %tmp243, undef
+  %tmp247 = fsub float %tmp11, undef
+  %tmp248 = fsub float %tmp12, undef
+  %tmp249 = fsub float %tmp13, undef
+  %tmp250 = fmul float %tmp247, undef
+  %tmp251 = fmul float %tmp248, undef
+  %tmp252 = fmul float %tmp249, undef
+  %tmp253 = call float @llvm.minnum.f32(float %tmp244, float %tmp250)
+  %tmp254 = call float @llvm.minnum.f32(float %tmp245, float %tmp251)
+  %tmp255 = call float @llvm.maxnum.f32(float %tmp246, float %tmp252)
+  %tmp256 = call float @llvm.maxnum.f32(float %tmp253, float %tmp254)
+  %tmp257 = call float @llvm.maxnum.f32(float %tmp256, float undef)
+  %tmp258 = call float @llvm.minnum.f32(float undef, float %tmp255)
+  %tmp259 = fcmp ogt float %tmp257, 0.000000e+00
+  %tmp260 = fcmp olt float %tmp257, 1.000000e+00
+  %tmp261 = and i1 %tmp259, %tmp260
+  %tmp262 = fcmp olt float %tmp257, %tmp258
+  %tmp263 = and i1 %tmp261, %tmp262
+  br i1 %tmp263, label %ENDIF2795, label %ELSE2800

 ENDIF2795:                                        ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564
  br label %LOOP
@ -400,53 +401,53 @@ ELSE2800:                                         ; preds = %ELSE2797
  br i1 undef, label %ENDIF2795, label %ELSE2803

 ELSE2803:                                         ; preds = %ELSE2800
-  %264 = fsub float %20, undef
-  %265 = fsub float %21, undef
-  %266 = fsub float %22, undef
-  %267 = fmul float %264, undef
-  %268 = fmul float %265, undef
-  %269 = fmul float %266, 0.000000e+00
-  %270 = fsub float %23, undef
-  %271 = fsub float %24, undef
-  %272 = fsub float %25, undef
-  %273 = fmul float %270, undef
-  %274 = fmul float %271, undef
-  %275 = fmul float %272, undef
-  %276 = call float @llvm.minnum.f32(float %267, float %273)
-  %277 = call float @llvm.maxnum.f32(float %268, float %274)
-  %278 = call float @llvm.maxnum.f32(float %269, float %275)
-  %279 = call float @llvm.maxnum.f32(float %276, float undef)
-  %280 = call float @llvm.maxnum.f32(float %279, float undef)
-  %281 = call float @llvm.minnum.f32(float undef, float %277)
-  %282 = call float @llvm.minnum.f32(float %281, float %278)
-  %283 = fcmp ogt float %280, 0.000000e+00
-  %284 = fcmp olt float %280, 1.000000e+00
-  %285 = and i1 %283, %284
-  %286 = fcmp olt float %280, %282
-  %287 = and i1 %285, %286
-  br i1 %287, label %ENDIF2795, label %ELSE2806
+  %tmp264 = fsub float %tmp20, undef
+  %tmp265 = fsub float %tmp21, undef
+  %tmp266 = fsub float %tmp22, undef
+  %tmp267 = fmul float %tmp264, undef
+  %tmp268 = fmul float %tmp265, undef
+  %tmp269 = fmul float %tmp266, 0.000000e+00
+  %tmp270 = fsub float %tmp23, undef
+  %tmp271 = fsub float %tmp24, undef
+  %tmp272 = fsub float %tmp25, undef
+  %tmp273 = fmul float %tmp270, undef
+  %tmp274 = fmul float %tmp271, undef
+  %tmp275 = fmul float %tmp272, undef
+  %tmp276 = call float @llvm.minnum.f32(float %tmp267, float %tmp273)
+  %tmp277 = call float @llvm.maxnum.f32(float %tmp268, float %tmp274)
+  %tmp278 = call float @llvm.maxnum.f32(float %tmp269, float %tmp275)
+  %tmp279 = call float @llvm.maxnum.f32(float %tmp276, float undef)
+  %tmp280 = call float @llvm.maxnum.f32(float %tmp279, float undef)
+  %tmp281 = call float @llvm.minnum.f32(float undef, float %tmp277)
+  %tmp282 = call float @llvm.minnum.f32(float %tmp281, float %tmp278)
+  %tmp283 = fcmp ogt float %tmp280, 0.000000e+00
+  %tmp284 = fcmp olt float %tmp280, 1.000000e+00
+  %tmp285 = and i1 %tmp283, %tmp284
+  %tmp286 = fcmp olt float %tmp280, %tmp282
+  %tmp287 = and i1 %tmp285, %tmp286
+  br i1 %tmp287, label %ENDIF2795, label %ELSE2806

 ELSE2806:                                         ; preds = %ELSE2803
-  %288 = fsub float %26, undef
-  %289 = fsub float %27, undef
-  %290 = fsub float %28, undef
-  %291 = fmul float %288, undef
-  %292 = fmul float %289, 0.000000e+00
-  %293 = fmul float %290, undef
-  %294 = fsub float %29, undef
-  %295 = fmul float %294, undef
-  %296 = call float @llvm.minnum.f32(float %291, float %295)
-  %297 = call float @llvm.minnum.f32(float %292, float undef)
-  %298 = call float @llvm.maxnum.f32(float %293, float undef)
-  %299 = call float @llvm.maxnum.f32(float %296, float %297)
-  %300 = call float @llvm.maxnum.f32(float %299, float undef)
-  %301 = call float @llvm.minnum.f32(float undef, float %298)
-  %302 = fcmp ogt float %300, 0.000000e+00
-  %303 = fcmp olt float %300, 1.000000e+00
-  %304 = and i1 %302, %303
-  %305 = fcmp olt float %300, %301
-  %306 = and i1 %304, %305
-  br i1 %306, label %ENDIF2795, label %ELSE2809
+  %tmp288 = fsub float %tmp26, undef
+  %tmp289 = fsub float %tmp27, undef
+  %tmp290 = fsub float %tmp28, undef
+  %tmp291 = fmul float %tmp288, undef
+  %tmp292 = fmul float %tmp289, 0.000000e+00
+  %tmp293 = fmul float %tmp290, undef
+  %tmp294 = fsub float %tmp29, undef
+  %tmp295 = fmul float %tmp294, undef
+  %tmp296 = call float @llvm.minnum.f32(float %tmp291, float %tmp295)
+  %tmp297 = call float @llvm.minnum.f32(float %tmp292, float undef)
+  %tmp298 = call float @llvm.maxnum.f32(float %tmp293, float undef)
+  %tmp299 = call float @llvm.maxnum.f32(float %tmp296, float %tmp297)
+  %tmp300 = call float @llvm.maxnum.f32(float %tmp299, float undef)
+  %tmp301 = call float @llvm.minnum.f32(float undef, float %tmp298)
+  %tmp302 = fcmp ogt float %tmp300, 0.000000e+00
+  %tmp303 = fcmp olt float %tmp300, 1.000000e+00
+  %tmp304 = and i1 %tmp302, %tmp303
+  %tmp305 = fcmp olt float %tmp300, %tmp301
+  %tmp306 = and i1 %tmp304, %tmp305
+  br i1 %tmp306, label %ENDIF2795, label %ELSE2809

 ELSE2809:                                         ; preds = %ELSE2806
  br i1 undef, label %ENDIF2795, label %ELSE2812
@ -461,53 +462,42 @@ ELSE2818:                                         ; preds = %ELSE2815
  br i1 undef, label %ENDIF2795, label %ELSE2821

 ELSE2821:                                         ; preds = %ELSE2818
-  %307 = fsub float %56, undef
-  %308 = fsub float %57, undef
-  %309 = fsub float %58, undef
-  %310 = fmul float %307, undef
-  %311 = fmul float %308, 0.000000e+00
-  %312 = fmul float %309, undef
-  %313 = fsub float %59, undef
-  %314 = fsub float %60, undef
-  %315 = fsub float %61, undef
-  %316 = fmul float %313, undef
-  %317 = fmul float %314, undef
-  %318 = fmul float %315, undef
-  %319 = call float @llvm.maxnum.f32(float %310, float %316)
-  %320 = call float @llvm.maxnum.f32(float %311, float %317)
-  %321 = call float @llvm.maxnum.f32(float %312, float %318)
-  %322 = call float @llvm.minnum.f32(float %319, float %320)
-  %323 = call float @llvm.minnum.f32(float %322, float %321)
-  %324 = fcmp ogt float undef, 0.000000e+00
-  %325 = fcmp olt float undef, 1.000000e+00
-  %326 = and i1 %324, %325
-  %327 = fcmp olt float undef, %323
-  %328 = and i1 %326, %327
-  br i1 %328, label %ENDIF2795, label %ELSE2824
+  %tmp307 = fsub float %tmp56, undef
+  %tmp308 = fsub float %tmp57, undef
+  %tmp309 = fsub float %tmp58, undef
+  %tmp310 = fmul float %tmp307, undef
+  %tmp311 = fmul float %tmp308, 0.000000e+00
+  %tmp312 = fmul float %tmp309, undef
+  %tmp313 = fsub float %tmp59, undef
+  %tmp314 = fsub float %tmp60, undef
+  %tmp315 = fsub float %tmp61, undef
+  %tmp316 = fmul float %tmp313, undef
+  %tmp317 = fmul float %tmp314, undef
+  %tmp318 = fmul float %tmp315, undef
+  %tmp319 = call float @llvm.maxnum.f32(float %tmp310, float %tmp316)
+  %tmp320 = call float @llvm.maxnum.f32(float %tmp311, float %tmp317)
+  %tmp321 = call float @llvm.maxnum.f32(float %tmp312, float %tmp318)
+  %tmp322 = call float @llvm.minnum.f32(float %tmp319, float %tmp320)
+  %tmp323 = call float @llvm.minnum.f32(float %tmp322, float %tmp321)
+  %tmp324 = fcmp ogt float undef, 0.000000e+00
+  %tmp325 = fcmp olt float undef, 1.000000e+00
+  %tmp326 = and i1 %tmp324, %tmp325
+  %tmp327 = fcmp olt float undef, %tmp323
+  %tmp328 = and i1 %tmp326, %tmp327
+  br i1 %tmp328, label %ENDIF2795, label %ELSE2824

 ELSE2824:                                         ; preds = %ELSE2821
  %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00
  br label %ENDIF2795
 }

-declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-
-; Function Attrs: nounwind readnone
 declare float @llvm.floor.f32(float) #1
-
-; Function Attrs: nounwind readnone
 declare float @llvm.sqrt.f32(float) #1
-
-; Function Attrs: nounwind readnone
 declare float @llvm.minnum.f32(float, float) #1
-
-; Function Attrs: nounwind readnone
 declare float @llvm.maxnum.f32(float, float) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@ -1,16 +1,16 @@
-; RUN: llc < %s -march=amdgcn -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s
-; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=CI --check-prefix=GCN  %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SIVI -check-prefix=SICI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=SICI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SIVI %s

 ; SMRD load with an immediate offset.
 ; GCN-LABEL: {{^}}smrd0:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
-define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
-  %1 = load i32, i32 addrspace(2)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 1 ; i32 element 1 = byte 4; the SICI check line expects 0x1, the VI one 0x4
+  %tmp1 = load i32, i32 addrspace(2)* %tmp ; the load the s_load_dword checks are written against
+  store i32 %tmp1, i32 addrspace(1)* %out ; write the loaded value to %out
   ret void
 }

@ -18,11 +18,11 @@ entry:
 ; GCN-LABEL: {{^}}smrd1:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
-  %1 = load i32, i32 addrspace(2)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 255 ; i32 element 255 (0xff) = byte 1020 (0x3fc), the two offsets in the checks
+  %tmp1 = load i32, i32 addrspace(2)* %tmp ; the load the s_load_dword checks are written against
+  store i32 %tmp1, i32 addrspace(1)* %out ; write the loaded value to %out
   ret void
 }

@ -33,11 +33,11 @@ entry:
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 ; GCN: s_endpgm
-define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
-  %1 = load i32, i32 addrspace(2)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 256 ; i32 element 256 (0x100, CI check) = byte 1024 (0x400, VI check)
+  %tmp1 = load i32, i32 addrspace(2)* %tmp ; the load the s_load_dword checks are written against
+  store i32 %tmp1, i32 addrspace(1)* %out ; write the loaded value to %out
   ret void
 }

@ -48,11 +48,11 @@ entry:
 ; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
 ; TODO: Add VI checks
 ; GCN: s_endpgm
-define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
-  %1 = load i32, i32 addrspace(2)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
+  %tmp1 = load i32, i32 addrspace(2)* %tmp ; load through the element index that no longer fits in 32 bits
+  store i32 %tmp1, i32 addrspace(1)* %out ; write the loaded value to %out
   ret void
 }

@ -62,11 +62,11 @@ entry:
 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
-  %1 = load i32, i32 addrspace(2)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143 ; i32 element 0x3ffff (CI check) = byte 0xffffc (VI check)
+  %tmp1 = load i32, i32 addrspace(2)* %tmp ; the load the s_load_dword checks are written against
+  store i32 %tmp1, i32 addrspace(1)* %out ; write the loaded value to %out
   ret void
 }

@ -76,11 +76,11 @@ entry:
 ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
-  %1 = load i32, i32 addrspace(2)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144 ; i32 element 0x40000 (CI check); SIVI checks expect a register offset instead
+  %tmp1 = load i32, i32 addrspace(2)* %tmp ; the load the s_load_dword checks are written against
+  store i32 %tmp1, i32 addrspace(1)* %out ; write the loaded value to %out
   ret void
 }

@ -88,12 +88,12 @@ entry:
 ; GCN-LABEL: {{^}}smrd_load_const0:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
-define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
-  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 ; index 0, so this still addresses %arg itself
+  %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp ; the <16 x i8> value handed to load.const as its first operand
+  %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16) ; offset operand 16 (0x10, VI check); 16/4 = 0x4 (SICI check line)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ; passes the loaded float as all four float operands
   ret void
 }

@ -102,14 +102,15 @@ main_body:
 ; GCN-LABEL: {{^}}smrd_load_const1:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
-  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 ; index 0, so this still addresses %arg itself
+  %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp ; the <16 x i8> value handed to load.const as its first operand
+  %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1020) ; offset operand 1020 (0x3fc, VI check); 1020/4 = 0xff (SICI check line)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ; passes the loaded float as all four float operands
   ret void
 }
+
 ; SMRD load using the load.const intrinsic with an offset greater than the
 ; largest possible immediate.
 ; immediate offset.
@ -118,12 +119,12 @@ main_body:
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
-define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
-  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 ; index 0, so this still addresses %arg itself
+  %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp ; the <16 x i8> value handed to load.const as its first operand
+  %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1024) ; offset operand 1024 (0x400, VI check); 1024/4 = 0x100 (CI check); SI check expects an SGPR offset
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ; passes the loaded float as all four float operands
   ret void
 }

@ -133,12 +134,12 @@ main_body:
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
-  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048572)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 ; index 0, so this still addresses %arg itself
+  %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp ; the <16 x i8> value handed to load.const as its first operand
+  %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048572) ; offset operand 1048572 (0xffffc, VI check); /4 = 0x3ffff (CI check)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ; passes the loaded float as all four float operands
   ret void
 }

@ -148,18 +149,17 @@ main_body:
 ; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
-  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048576)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0 ; index 0, so this still addresses %arg itself
+  %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp ; the <16 x i8> value handed to load.const as its first operand
+  %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048576) ; offset operand 1048576; /4 = 0x40000 (CI check); SIVI checks expect a register offset
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0 ; passes the loaded float as all four float operands
   ret void
 }

-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1

-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/test/CodeGen/AMDGPU/spill-m0.ll
@ -107,7 +107,7 @@ endif:                                            ; preds = %else, %if
  %export = phi float [ %lds_data, %if ], [ %interp, %else ]
  %tmp4 = call i32 @llvm.SI.packf16(float %export, float %export)
  %tmp5 = bitcast i32 %tmp4 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp5, float %tmp5, float %tmp5, float %tmp5)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp5, float %tmp5, float %tmp5, float %tmp5, i1 true, i1 true) #0
  ret void
 }

@ -205,11 +205,9 @@ ret:
  ret void
 }

-declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0
-
-declare i32 @llvm.SI.packf16(float, float) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare i32 @llvm.SI.packf16(float, float) #1

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/split-smrd.ll
+++ b/test/CodeGen/AMDGPU/split-smrd.ll
@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

 ; FIXME: Move this to sgpr-copy.ll when this is fixed on VI.
 ; Make sure that when we split an smrd instruction in order to move it to
 ; the VALU, we are also moving its users to the VALU.
-; CHECK-LABEL: {{^}}split_smrd_add_worklist:
-; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1

+; GCN-LABEL: {{^}}split_smrd_add_worklist:
+; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
 define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
 bb:
  %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
@ -24,24 +24,20 @@ bb3:                                              ; preds = %bb
  %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
  %tmp10 = extractelement <4 x float> %tmp9, i32 0
  %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
-  %tmp13 = bitcast i32 %tmp12 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp13, float undef, float undef)
+  %tmp13 = bitcast i32 %tmp12 to <2 x half>
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp13, <2 x half> undef, i1 true, i1 true) #0
  ret void
 }

-; Function Attrs: nounwind readnone
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+
 declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
 declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
 declare i32 @llvm.SI.packf16(float, float) #1

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }

 !0 = !{!1, !1, i64 0, i32 1}
-!1 = !{!"const", !3}
-!2 = !{!1, !1, i64 0}
-!3 = !{!"tbaa root"}
+!1 = !{!"const", !2}
+!2 = !{!"tbaa root"}
--- a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
+++ b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
@ -1,39 +1,37 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s

-; SI-LABEL:{{^}}row_filter_C1_D0:
-; SI: s_endpgm
-; Function Attrs: nounwind
+; GCN-LABEL:{{^}}row_filter_C1_D0:
 define void @row_filter_C1_D0() {
 entry:
  br i1 undef, label %for.inc.1, label %do.body.preheader

 do.body.preheader:                                ; preds = %entry
-  %0 = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1
+  %tmp = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1
  br i1 undef, label %do.body56.1, label %do.body90

 do.body90:                                        ; preds = %do.body56.2, %do.body56.1, %do.body.preheader
-  %1 = phi <4 x i32> [ %6, %do.body56.2 ], [ %5, %do.body56.1 ], [ %0, %do.body.preheader ]
-  %2 = insertelement <4 x i32> %1, i32 undef, i32 2
-  %3 = insertelement <4 x i32> %2, i32 undef, i32 3
+  %tmp1 = phi <4 x i32> [ %tmp6, %do.body56.2 ], [ %tmp5, %do.body56.1 ], [ %tmp, %do.body.preheader ]
+  %tmp2 = insertelement <4 x i32> %tmp1, i32 undef, i32 2
+  %tmp3 = insertelement <4 x i32> %tmp2, i32 undef, i32 3
  br i1 undef, label %do.body124.1, label %do.body.1562.preheader

 do.body.1562.preheader:                           ; preds = %do.body124.1, %do.body90
-  %storemerge = phi <4 x i32> [ %3, %do.body90 ], [ %7, %do.body124.1 ]
-  %4 = insertelement <4 x i32> undef, i32 undef, i32 1
+  %storemerge = phi <4 x i32> [ %tmp3, %do.body90 ], [ %tmp7, %do.body124.1 ]
+  %tmp4 = insertelement <4 x i32> undef, i32 undef, i32 1
  br label %for.inc.1

 do.body56.1:                                      ; preds = %do.body.preheader
-  %5 = insertelement <4 x i32> %0, i32 undef, i32 1
+  %tmp5 = insertelement <4 x i32> %tmp, i32 undef, i32 1
  %or.cond472.1 = or i1 undef, undef
  br i1 %or.cond472.1, label %do.body56.2, label %do.body90

 do.body56.2:                                      ; preds = %do.body56.1
-  %6 = insertelement <4 x i32> %5, i32 undef, i32 1
+  %tmp6 = insertelement <4 x i32> %tmp5, i32 undef, i32 1
  br label %do.body90

 do.body124.1:                                     ; preds = %do.body90
-  %7 = insertelement <4 x i32> %3, i32 undef, i32 3
+  %tmp7 = insertelement <4 x i32> %tmp3, i32 undef, i32 3
  br label %do.body.1562.preheader

 for.inc.1:                                        ; preds = %do.body.1562.preheader, %entry
@ -42,8 +40,8 @@ for.inc.1:                                        ; preds = %do.body.1562.prehea
  unreachable
 }

-; SI-LABEL: {{^}}foo:
-; SI: s_endpgm
+; GCN-LABEL: {{^}}foo:
+; GCN: s_endpgm
 define amdgpu_ps void @foo() #0 {
 bb:
  br i1 undef, label %bb2, label %bb1
@ -78,9 +76,9 @@ bb13:                                             ; preds = %bb2
 bb14:                                             ; preds = %bb27, %bb24, %bb9
  %tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ]
  %tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ]
-  %tmp17 = fmul float 10.5, %tmp16
-  %tmp18 = fmul float 11.5, %tmp15
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17)
+  %tmp17 = fmul float 1.050000e+01, %tmp16
+  %tmp18 = fmul float 1.150000e+01, %tmp15
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp18, float %tmp17, float %tmp17, float %tmp17, i1 true, i1 true) #0
  ret void

 bb23:                                             ; preds = %bb13
@ -97,13 +95,8 @@ bb27:                                             ; preds = %bb24
  br label %bb14
 }

-; Function Attrs: nounwind readnone
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1

-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/udiv.ll
+++ b/test/CodeGen/AMDGPU/udiv.ll
@ -5,17 +5,19 @@
 ; FUNC-LABEL: {{^}}udiv_i32:
 ; EG-NOT: SETGE_INT
 ; EG: CF_END
+
+; SI: v_rcp_iflag_f32_e32
 define void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
  %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1) * %in
-  %b = load i32, i32 addrspace(1) * %b_ptr
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
  %result = udiv i32 %a, %b
  store i32 %result, i32 addrspace(1)* %out
  ret void
 }

 ; FUNC-LABEL: {{^}}s_udiv_i32:
-
+; SI: v_rcp_iflag_f32_e32
 define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
  %result = udiv i32 %a, %b
  store i32 %result, i32 addrspace(1)* %out
@ -30,6 +32,8 @@ define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 ; FUNC-LABEL: {{^}}udiv_v2i32:
 ; EG: CF_END

+; SI: v_rcp_iflag_f32_e32
+; SI: v_rcp_iflag_f32_e32
 ; SI: s_endpgm
 define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
  %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
@ -158,3 +162,21 @@ define void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %i
  store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
  ret void
 }
+
+; FUNC-LABEL: {{^}}test_udiv2:
+; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define void @test_udiv2(i32 %p) {
+  %i = udiv i32 %p, 2 ; unsigned divide by 2, checked above as one scalar right shift by 1
+  store volatile i32 %i, i32 addrspace(1)* undef ; volatile store to an undef address is the only use of %i
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_udiv_3_mulhu:
+; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
+; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], {{v[0-9]+}}, {{s[0-9]+}}
+; SI-NEXT: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, [[MULHI]]
+define void @test_udiv_3_mulhu(i32 %p) {
+  %i = udiv i32 %p, 3 ; unsigned divide by 3, checked above as mul-hi by 0xaaaaaaab then a right shift by 1
+  store volatile i32 %i, i32 addrspace(1)* undef ; volatile store to an undef address is the only use of %i
+  ret void
+}
--- a/test/CodeGen/AMDGPU/urecip.ll
+++ b/test/CodeGen/AMDGPU/urecip.ll
@ -1,13 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-
-; CHECK: v_rcp_iflag_f32_e32
-
-define void @test(i32 %p, i32 %q) {
-   %i = udiv i32 %p, %q
-   %r = bitcast i32 %i to float
-   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
-   ret void
-}
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@ -179,39 +179,39 @@ bb24:                                             ; preds = %bb157, %bb
  br i1 %tmp155, label %bb156, label %bb157

 bb156:                                            ; preds = %bb24
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp12, float %tmp103, float %tmp102, float %tmp101)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 33, i32 0, float %tmp99, float %tmp98, float %tmp97, float %tmp95)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 34, i32 0, float %tmp94, float %tmp93, float %tmp91, float %tmp90)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 35, i32 0, float %tmp89, float %tmp87, float %tmp86, float %tmp85)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 36, i32 0, float %tmp83, float %tmp82, float %tmp81, float %tmp79)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 37, i32 0, float %tmp78, float %tmp77, float %tmp75, float %tmp74)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 38, i32 0, float %tmp73, float %tmp71, float %tmp70, float %tmp69)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 39, i32 0, float %tmp67, float %tmp66, float %tmp65, float %tmp63)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 40, i32 0, float %tmp62, float %tmp61, float %tmp59, float %tmp58)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 41, i32 0, float %tmp57, float %tmp55, float %tmp54, float %tmp53)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 42, i32 0, float %tmp51, float %tmp50, float %tmp49, float %tmp47)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 43, i32 0, float %tmp46, float %tmp45, float %tmp43, float %tmp42)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 44, i32 0, float %tmp41, float %tmp39, float %tmp38, float %tmp37)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 45, i32 0, float %tmp35, float %tmp34, float %tmp33, float %tmp31)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 46, i32 0, float %tmp30, float %tmp29, float %tmp27, float %tmp26)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 47, i32 0, float %tmp25, float %tmp28, float %tmp32, float %tmp36)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 48, i32 0, float %tmp40, float %tmp44, float %tmp48, float %tmp52)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 49, i32 0, float %tmp56, float %tmp60, float %tmp64, float %tmp68)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 50, i32 0, float %tmp72, float %tmp76, float %tmp80, float %tmp84)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 51, i32 0, float %tmp88, float %tmp92, float %tmp96, float %tmp100)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 52, i32 0, float %tmp104, float %tmp105, float %tmp106, float %tmp108)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 53, i32 0, float %tmp109, float %tmp110, float %tmp111, float %tmp112)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 54, i32 0, float %tmp113, float %tmp114, float %tmp115, float %tmp116)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 55, i32 0, float %tmp117, float %tmp118, float %tmp119, float %tmp120)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 56, i32 0, float %tmp121, float %tmp122, float %tmp123, float %tmp124)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 57, i32 0, float %tmp125, float %tmp126, float %tmp127, float %tmp128)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 58, i32 0, float %tmp129, float %tmp130, float %tmp131, float %tmp132)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 59, i32 0, float %tmp133, float %tmp134, float %tmp135, float %tmp136)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 60, i32 0, float %tmp137, float %tmp138, float %tmp139, float %tmp140)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 61, i32 0, float %tmp141, float %tmp142, float %tmp143, float %tmp144)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 62, i32 0, float %tmp145, float %tmp146, float %tmp147, float %tmp148)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float %tmp149, float %tmp150, float %tmp151, float %tmp13)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22)
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp12, float %tmp103, float %tmp102, float %tmp101, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float %tmp99, float %tmp98, float %tmp97, float %tmp95, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 34, i32 15, float %tmp94, float %tmp93, float %tmp91, float %tmp90, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 35, i32 15, float %tmp89, float %tmp87, float %tmp86, float %tmp85, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 36, i32 15, float %tmp83, float %tmp82, float %tmp81, float %tmp79, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 37, i32 15, float %tmp78, float %tmp77, float %tmp75, float %tmp74, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 38, i32 15, float %tmp73, float %tmp71, float %tmp70, float %tmp69, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 39, i32 15, float %tmp67, float %tmp66, float %tmp65, float %tmp63, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 40, i32 15, float %tmp62, float %tmp61, float %tmp59, float %tmp58, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 41, i32 15, float %tmp57, float %tmp55, float %tmp54, float %tmp53, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 42, i32 15, float %tmp51, float %tmp50, float %tmp49, float %tmp47, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 43, i32 15, float %tmp46, float %tmp45, float %tmp43, float %tmp42, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 44, i32 15, float %tmp41, float %tmp39, float %tmp38, float %tmp37, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 45, i32 15, float %tmp35, float %tmp34, float %tmp33, float %tmp31, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 46, i32 15, float %tmp30, float %tmp29, float %tmp27, float %tmp26, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 47, i32 15, float %tmp25, float %tmp28, float %tmp32, float %tmp36, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 48, i32 15, float %tmp40, float %tmp44, float %tmp48, float %tmp52, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 49, i32 15, float %tmp56, float %tmp60, float %tmp64, float %tmp68, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 50, i32 15, float %tmp72, float %tmp76, float %tmp80, float %tmp84, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 51, i32 15, float %tmp88, float %tmp92, float %tmp96, float %tmp100, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 52, i32 15, float %tmp104, float %tmp105, float %tmp106, float %tmp108, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 53, i32 15, float %tmp109, float %tmp110, float %tmp111, float %tmp112, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 54, i32 15, float %tmp113, float %tmp114, float %tmp115, float %tmp116, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 55, i32 15, float %tmp117, float %tmp118, float %tmp119, float %tmp120, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 56, i32 15, float %tmp121, float %tmp122, float %tmp123, float %tmp124, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 57, i32 15, float %tmp125, float %tmp126, float %tmp127, float %tmp128, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 58, i32 15, float %tmp129, float %tmp130, float %tmp131, float %tmp132, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 59, i32 15, float %tmp133, float %tmp134, float %tmp135, float %tmp136, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 60, i32 15, float %tmp137, float %tmp138, float %tmp139, float %tmp140, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 61, i32 15, float %tmp141, float %tmp142, float %tmp143, float %tmp144, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 62, i32 15, float %tmp145, float %tmp146, float %tmp147, float %tmp148, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float %tmp149, float %tmp150, float %tmp151, float %tmp13, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp19, float %tmp20, float %tmp21, float %tmp22, i1 true, i1 false) #0
  ret void

 bb157:                                            ; preds = %bb24
@ -482,15 +482,11 @@ bb157:                                            ; preds = %bb24
  br label %bb24
 }

-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1

 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
--- a/test/CodeGen/AMDGPU/wait.ll
+++ b/test/CodeGen/AMDGPU/wait.ll
@ -11,7 +11,7 @@
 ; DEFAULT: exp
 ; DEFAULT: s_waitcnt lgkmcnt(0)
 ; DEFAULT: s_endpgm
-define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) {
+define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
 main_body:
  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0
  %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@ -20,8 +20,7 @@ main_body:
  %tmp13 = extractelement <4 x float> %tmp11, i32 1
  call void @llvm.amdgcn.s.barrier() #1
  %tmp14 = extractelement <4 x float> %tmp11, i32 2
-;  %tmp15 = extractelement <4 x float> %tmp11, i32 3
-  %tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt
+  %tmp15 = load float, float addrspace(2)* %constptr, align 4
  %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1
  %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0
  %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6)
@ -29,8 +28,8 @@ main_body:
  %tmp20 = extractelement <4 x float> %tmp18, i32 1
  %tmp21 = extractelement <4 x float> %tmp18, i32 2
  %tmp22 = extractelement <4 x float> %tmp18, i32 3
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15)
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp19, float %tmp20, float %tmp21, float %tmp22, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp12, float %tmp13, float %tmp14, float %tmp15, i1 true, i1 false) #0
  ret void
 }

@ -44,40 +43,34 @@ main_body:
 ; ILPMAX: s_waitcnt vmcnt(1)
 ; ILPMAX: s_waitcnt vmcnt(0)
 ; ILPMAX: s_endpgm
-
-define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)*
-byval, i32 inreg, i32 inreg, i32, i32, i32, i32) {
+define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
 main_body:
-  %11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
-  %12 = load <16 x i8>, <16 x i8> addrspace(2)* %11, align 16, !tbaa !0
-  %13 = add i32 %5, %7
-  %14 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %12, i32 0, i32 %13)
-  %15 = extractelement <4 x float> %14, i32 0
-  %16 = extractelement <4 x float> %14, i32 1
-  %17 = extractelement <4 x float> %14, i32 2
-  %18 = extractelement <4 x float> %14, i32 3
-  %19 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 1
-  %20 = load <16 x i8>, <16 x i8> addrspace(2)* %19, align 16, !tbaa !0
-  %21 = add i32 %5, %7
-  %22 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %20, i32 0, i32 %21)
-  %23 = extractelement <4 x float> %22, i32 0
-  %24 = extractelement <4 x float> %22, i32 1
-  %25 = extractelement <4 x float> %22, i32 2
-  %26 = extractelement <4 x float> %22, i32 3
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %15, float %16, float %17, float %18)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %23, float %24, float %25, float %26)
+  %tmp = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0
+  %tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0
+  %tmp12 = add i32 %arg5, %arg7
+  %tmp13 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp11, i32 0, i32 %tmp12)
+  %tmp14 = extractelement <4 x float> %tmp13, i32 0
+  %tmp15 = extractelement <4 x float> %tmp13, i32 1
+  %tmp16 = extractelement <4 x float> %tmp13, i32 2
+  %tmp17 = extractelement <4 x float> %tmp13, i32 3
+  %tmp18 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 1
+  %tmp19 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp18, align 16, !tbaa !0
+  %tmp20 = add i32 %arg5, %arg7
+  %tmp21 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp19, i32 0, i32 %tmp20)
+  %tmp22 = extractelement <4 x float> %tmp21, i32 0
+  %tmp23 = extractelement <4 x float> %tmp21, i32 1
+  %tmp24 = extractelement <4 x float> %tmp21, i32 2
+  %tmp25 = extractelement <4 x float> %tmp21, i32 3
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp14, float %tmp15, float %tmp16, float %tmp17, i1 true, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp22, float %tmp23, float %tmp24, float %tmp25, i1 false, i1 false) #0
  ret void
 }

-
-; Function Attrs: convergent nounwind
 declare void @llvm.amdgcn.s.barrier() #1
-
-; Function Attrs: nounwind readnone
 declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0

-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
+attributes #0 = { nounwind }
 attributes #1 = { convergent nounwind }
 attributes #2 = { nounwind readnone }

--- a/test/CodeGen/AMDGPU/wqm.ll
+++ b/test/CodeGen/AMDGPU/wqm.ll
@ -1,5 +1,5 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=VI %s

 ; Check that WQM isn't triggered by image load/store intrinsics.
 ;
@ -25,9 +25,7 @@ main_body:
  %c.3 = extractelement <4 x i32> %c.2, i32 0
  %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
  %data = load float, float addrspace(1)* %gep
-
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %data, float undef, float undef, float undef)
-
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1
  ret void
 }

@ -500,7 +498,7 @@ end:
  ret <4 x float> %r
 }

-
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
 declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
 declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
@ -512,8 +510,7 @@ declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i3
 declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
 declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3

-declare void @llvm.AMDGPU.kill(float)
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.AMDGPU.kill(float) #1

 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
--- a/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll
+++ b/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll
@ -6,46 +6,51 @@

 target triple = "amdgcn--"

-declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #0
-declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #2
-
-define amdgpu_vs void @wrapper(i32 inreg, i32) {
+define amdgpu_vs void @wrapper(i32 inreg %arg, i32 %arg1) {
 main_body:
-  %2 = add i32 %1, %0
-  %3 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> undef, i32 0, i32 %2)
-  %4 = extractelement <4 x float> %3, i32 1
-  %5 = fptosi float %4 to i32
-  %6 = insertelement <2 x i32> undef, i32 %5, i32 1
+  %tmp = add i32 %arg1, %arg
+  %tmp2 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> undef, i32 0, i32 %tmp)
+  %tmp3 = extractelement <4 x float> %tmp2, i32 1
+  %tmp4 = fptosi float %tmp3 to i32
+  %tmp5 = insertelement <2 x i32> undef, i32 %tmp4, i32 1
  br label %loop11.i

 loop11.i:                                         ; preds = %endif46.i, %main_body
-  %7 = phi i32 [ 0, %main_body ], [ %15, %endif46.i ]
-  %8 = icmp sgt i32 %7, 999
-  br i1 %8, label %main.exit, label %if16.i
+  %tmp6 = phi i32 [ 0, %main_body ], [ %tmp14, %endif46.i ]
+  %tmp7 = icmp sgt i32 %tmp6, 999
+  br i1 %tmp7, label %main.exit, label %if16.i

 if16.i:                                           ; preds = %loop11.i
-  %9 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %6, <8 x i32> undef, i32 15, i1 true, i1 false, i1 false, i1 false)
-  %10 = extractelement <4 x float> %9, i32 0
-  %11 = fcmp ult float 0.000000e+00, %10
-  br i1 %11, label %if28.i, label %endif46.i
+  %tmp8 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %tmp5, <8 x i32> undef, i32 15, i1 true, i1 false, i1 false, i1 false)
+  %tmp9 = extractelement <4 x float> %tmp8, i32 0
+  %tmp10 = fcmp ult float 0.000000e+00, %tmp9
+  br i1 %tmp10, label %if28.i, label %endif46.i

 if28.i:                                           ; preds = %if16.i
-  %12 = bitcast float %10 to i32
-  %13 = shl i32 %12, 16
-  %14 = bitcast i32 %13 to float
+  %tmp11 = bitcast float %tmp9 to i32
+  %tmp12 = shl i32 %tmp11, 16
+  %tmp13 = bitcast i32 %tmp12 to float
  br label %main.exit

 endif46.i:                                        ; preds = %if16.i
-  %15 = add i32 %7, 1
+  %tmp14 = add i32 %tmp6, 1
  br label %loop11.i

 main.exit:                                        ; preds = %if28.i, %loop11.i
-  %16 = phi float [ %14, %if28.i ], [ 0x36F0800000000000, %loop11.i ]
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %16, float 0.000000e+00, float 0.000000e+00, float 0x36A0000000000000)
+  %tmp15 = phi float [ %tmp13, %if28.i ], [ 0x36F0800000000000, %loop11.i ]
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp15, float 0.000000e+00, float 0.000000e+00, float 0x36A0000000000000, i1 false, i1 false) #0
  ret void
 }

-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind readonly }
-attributes #2 = { nounwind }
+; Function Attrs: nounwind
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+; Function Attrs: nounwind readnone
+declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }