
AMDGPU: Mark all unspecified CC functions in tests as amdgpu_kernel

Currently, functions with the default C calling convention are treated
the same as compute kernels. Make this explicit so the default
calling convention can be changed to a non-kernel.

Converted with perl -pi -e 's/define void/define amdgpu_kernel void/'
on the relevant test directories (and undone in the one place that
actually wanted a non-kernel).
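
For reference, a sketch of the conversion as a full shell invocation; the
exact directory list is an assumption, since the message above only says
"the relevant test directories":

  # Hypothetical invocation: apply the substitution across the AMDGPU test
  # trees, then hand-revert the one test that actually wanted a non-kernel.
  find test/CodeGen/AMDGPU test/Analysis/CostModel/AMDGPU \
       test/Analysis/DivergenceAnalysis/AMDGPU \
       \( -name '*.ll' -o -name '*.mir' \) -type f |
    xargs perl -pi -e 's/define void/define amdgpu_kernel void/'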

llvm-svn: 298444
Matt Arsenault 2017-03-21 21:39:51 +00:00
parent 5a93187aed
commit dd9ab77318
714 changed files with 6248 additions and 6248 deletions

View File

@@ -3,7 +3,7 @@
 ; CHECK: 'add_i32'
 ; CHECK: estimated cost of 1 for {{.*}} add i32
-define void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %add = add i32 %vec, %b
 store i32 %add, i32 addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #
 ; CHECK: 'add_v2i32'
 ; CHECK: estimated cost of 2 for {{.*}} add <2 x i32>
-define void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
+define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
 %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
 %add = add <2 x i32> %vec, %b
 store <2 x i32> %add, <2 x i32> addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %va
 ; CHECK: 'add_v3i32'
 ; CHECK: estimated cost of 3 for {{.*}} add <3 x i32>
-define void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
+define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
 %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
 %add = add <3 x i32> %vec, %b
 store <3 x i32> %add, <3 x i32> addrspace(1)* %out
@@ -30,7 +30,7 @@ define void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %va
 ; CHECK: 'add_v4i32'
 ; CHECK: estimated cost of 4 for {{.*}} add <4 x i32>
-define void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
+define amdgpu_kernel void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
 %add = add <4 x i32> %vec, %b
 store <4 x i32> %add, <4 x i32> addrspace(1)* %out
@@ -39,7 +39,7 @@ define void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %va
 ; CHECK: 'add_i64'
 ; CHECK: estimated cost of 2 for {{.*}} add i64
-define void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %add = add i64 %vec, %b
 store i64 %add, i64 addrspace(1)* %out
@@ -48,7 +48,7 @@ define void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #
 ; CHECK: 'add_v2i64'
 ; CHECK: estimated cost of 4 for {{.*}} add <2 x i64>
-define void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
+define amdgpu_kernel void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
 %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
 %add = add <2 x i64> %vec, %b
 store <2 x i64> %add, <2 x i64> addrspace(1)* %out
@@ -57,7 +57,7 @@ define void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %va
 ; CHECK: 'add_v3i64'
 ; CHECK: estimated cost of 6 for {{.*}} add <3 x i64>
-define void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
+define amdgpu_kernel void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
 %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
 %add = add <3 x i64> %vec, %b
 store <3 x i64> %add, <3 x i64> addrspace(1)* %out
@@ -66,7 +66,7 @@ define void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %va
 ; CHECK: 'add_v4i64'
 ; CHECK: estimated cost of 8 for {{.*}} add <4 x i64>
-define void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
+define amdgpu_kernel void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
 %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
 %add = add <4 x i64> %vec, %b
 store <4 x i64> %add, <4 x i64> addrspace(1)* %out
@@ -75,7 +75,7 @@ define void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %va
 ; CHECK: 'add_v16i64'
 ; CHECK: estimated cost of 32 for {{.*}} add <16 x i64>
-define void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
+define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
 %vec = load <16 x i64>, <16 x i64> addrspace(1)* %vaddr
 %add = add <16 x i64> %vec, %b
 store <16 x i64> %add, <16 x i64> addrspace(1)* %out
@@ -84,7 +84,7 @@ define void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)*
 ; CHECK: 'add_i16'
 ; CHECK: estimated cost of 1 for {{.*}} add i16
-define void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
+define amdgpu_kernel void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
 %vec = load i16, i16 addrspace(1)* %vaddr
 %add = add i16 %vec, %b
 store i16 %add, i16 addrspace(1)* %out
@@ -93,7 +93,7 @@ define void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #
 ; CHECK: 'add_v2i16'
 ; CHECK: estimated cost of 2 for {{.*}} add <2 x i16>
-define void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
+define amdgpu_kernel void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
 %add = add <2 x i16> %vec, %b
 store <2 x i16> %add, <2 x i16> addrspace(1)* %out
@@ -102,7 +102,7 @@ define void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %va
 ; CHECK: 'sub_i32'
 ; CHECK: estimated cost of 1 for {{.*}} sub i32
-define void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %sub = sub i32 %vec, %b
 store i32 %sub, i32 addrspace(1)* %out
@@ -111,7 +111,7 @@ define void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #
 ; CHECK: 'sub_i64'
 ; CHECK: estimated cost of 2 for {{.*}} sub i64
-define void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %sub = sub i64 %vec, %b
 store i64 %sub, i64 addrspace(1)* %out
@@ -119,7 +119,7 @@ define void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #
 }
 ; CHECK: 'sub_i16'
 ; CHECK: estimated cost of 1 for {{.*}} sub i16
-define void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
+define amdgpu_kernel void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
 %vec = load i16, i16 addrspace(1)* %vaddr
 %sub = sub i16 %vec, %b
 store i16 %sub, i16 addrspace(1)* %out
@@ -128,7 +128,7 @@ define void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #
 ; CHECK: 'sub_v2i16'
 ; CHECK: estimated cost of 2 for {{.*}} sub <2 x i16>
-define void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
+define amdgpu_kernel void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
 %sub = sub <2 x i16> %vec, %b
 store <2 x i16> %sub, <2 x i16> addrspace(1)* %out

View File

@@ -2,7 +2,7 @@
 ; CHECK: 'or_i32'
 ; CHECK: estimated cost of 1 for {{.*}} or i32
-define void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %or = or i32 %vec, %b
 store i32 %or, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0
 ; CHECK: 'or_i64'
 ; CHECK: estimated cost of 2 for {{.*}} or i64
-define void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %or = or i64 %vec, %b
 store i64 %or, i64 addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0
 ; CHECK: 'xor_i32'
 ; CHECK: estimated cost of 1 for {{.*}} xor i32
-define void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %or = xor i32 %vec, %b
 store i32 %or, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #
 ; CHECK: 'xor_i64'
 ; CHECK: estimated cost of 2 for {{.*}} xor i64
-define void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %or = xor i64 %vec, %b
 store i64 %or, i64 addrspace(1)* %out
@@ -39,7 +39,7 @@ define void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #
 ; CHECK: 'and_i32'
 ; CHECK: estimated cost of 1 for {{.*}} and i32
-define void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %or = and i32 %vec, %b
 store i32 %or, i32 addrspace(1)* %out
@@ -48,7 +48,7 @@ define void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #
 ; CHECK: 'and_i64'
 ; CHECK: estimated cost of 2 for {{.*}} and i64
-define void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %or = and i64 %vec, %b
 store i64 %or, i64 addrspace(1)* %out

View File

@@ -4,7 +4,7 @@
 ; CHECK: estimated cost of 10 for instruction: br i1
 ; CHECK: estimated cost of 10 for instruction: br label
 ; CHECK: estimated cost of 10 for instruction: ret void
-define void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 bb0:
 br i1 undef, label %bb1, label %bb2
@@ -21,7 +21,7 @@ bb2:
 ; CHECK: 'test_switch_cost'
 ; CHECK: Unknown cost for instruction: switch
-define void @test_switch_cost(i32 %a) #0 {
+define amdgpu_kernel void @test_switch_cost(i32 %a) #0 {
 entry:
 switch i32 %a, label %default [
 i32 0, label %case0

View File

@@ -2,7 +2,7 @@
 ; CHECK: 'extractelement_v2i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32>
-define void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
 %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
 %elt = extractelement <2 x i32> %vec, i32 1
 store i32 %elt, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)
 ; CHECK: 'extractelement_v2f32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float>
-define void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
 %elt = extractelement <2 x float> %vec, i32 1
 store float %elt, float addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspac
 ; CHECK: 'extractelement_v3i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32>
-define void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
 %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
 %elt = extractelement <3 x i32> %vec, i32 1
 store i32 %elt, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)
 ; CHECK: 'extractelement_v4i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32>
-define void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
 %elt = extractelement <4 x i32> %vec, i32 1
 store i32 %elt, i32 addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)
 ; CHECK: 'extractelement_v8i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32>
-define void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
 %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
 %elt = extractelement <8 x i32> %vec, i32 1
 store i32 %elt, i32 addrspace(1)* %out
@@ -48,7 +48,7 @@ define void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)
 ; FIXME: Should be non-0
 ; CHECK: 'extractelement_v8i32_dynindex'
 ; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32>
-define void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
+define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
 %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
 %elt = extractelement <8 x i32> %vec, i32 %idx
 store i32 %elt, i32 addrspace(1)* %out
@@ -57,7 +57,7 @@ define void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> add
 ; CHECK: 'extractelement_v2i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64>
-define void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
 %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
 %elt = extractelement <2 x i64> %vec, i64 1
 store i64 %elt, i64 addrspace(1)* %out
@@ -66,7 +66,7 @@ define void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)
 ; CHECK: 'extractelement_v3i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64>
-define void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
 %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
 %elt = extractelement <3 x i64> %vec, i64 1
 store i64 %elt, i64 addrspace(1)* %out
@@ -75,7 +75,7 @@ define void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)
 ; CHECK: 'extractelement_v4i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64>
-define void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
 %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
 %elt = extractelement <4 x i64> %vec, i64 1
 store i64 %elt, i64 addrspace(1)* %out
@@ -84,7 +84,7 @@ define void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)
 ; CHECK: 'extractelement_v8i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64>
-define void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
 %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
 %elt = extractelement <8 x i64> %vec, i64 1
 store i64 %elt, i64 addrspace(1)* %out
@@ -93,7 +93,7 @@ define void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)
 ; CHECK: 'extractelement_v4i8'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8>
-define void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
 %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr
 %elt = extractelement <4 x i8> %vec, i8 1
 store i8 %elt, i8 addrspace(1)* %out
@@ -102,7 +102,7 @@ define void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %
 ; CHECK: 'extractelement_v2i16'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16>
-define void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
 %elt = extractelement <2 x i16> %vec, i16 1
 store i16 %elt, i16 addrspace(1)* %out

View File

@@ -2,7 +2,7 @@
 ; CHECK: 'fabs_f32'
 ; CHECK: estimated cost of 0 for {{.*}} call float @llvm.fabs.f32
-define void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
 %vec = load float, float addrspace(1)* %vaddr
 %fabs = call float @llvm.fabs.f32(float %vec) #1
 store float %fabs, float addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
 ; CHECK: 'fabs_v2f32'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x float> @llvm.fabs.v2f32
-define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
 %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %vec) #1
 store <2 x float> %fabs, <2 x float> addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)
 ; CHECK: 'fabs_v3f32'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x float> @llvm.fabs.v3f32
-define void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
 %fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %vec) #1
 store <3 x float> %fabs, <3 x float> addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)
 ; CHECK: 'fabs_f64'
 ; CHECK: estimated cost of 0 for {{.*}} call double @llvm.fabs.f64
-define void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
 %vec = load double, double addrspace(1)* %vaddr
 %fabs = call double @llvm.fabs.f64(double %vec) #1
 store double %fabs, double addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0
 ; CHECK: 'fabs_v2f64'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x double> @llvm.fabs.v2f64
-define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
 %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %vec) #1
 store <2 x double> %fabs, <2 x double> addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(
 ; CHECK: 'fabs_v3f64'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x double> @llvm.fabs.v3f64
-define void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
 %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
 %fabs = call <3 x double> @llvm.fabs.v3f64(<3 x double> %vec) #1
 store <3 x double> %fabs, <3 x double> addrspace(1)* %out
@@ -56,7 +56,7 @@ define void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(
 ; CHECK: 'fabs_f16'
 ; CHECK: estimated cost of 0 for {{.*}} call half @llvm.fabs.f16
-define void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
 %vec = load half, half addrspace(1)* %vaddr
 %fabs = call half @llvm.fabs.f16(half %vec) #1
 store half %fabs, half addrspace(1)* %out
@@ -65,7 +65,7 @@ define void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
 ; CHECK: 'fabs_v2f16'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x half> @llvm.fabs.v2f16
-define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
 %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
 %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %vec) #1
 store <2 x half> %fabs, <2 x half> addrspace(1)* %out
@@ -74,7 +74,7 @@ define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)*
 ; CHECK: 'fabs_v3f16'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x half> @llvm.fabs.v3f16
-define void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
 %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
 %fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %vec) #1
 store <3 x half> %fabs, <3 x half> addrspace(1)* %out

View File

@@ -3,7 +3,7 @@
 ; ALL: 'fadd_f32'
 ; ALL: estimated cost of 1 for {{.*}} fadd float
-define void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
 %vec = load float, float addrspace(1)* %vaddr
 %add = fadd float %vec, %b
 store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, floa
 ; ALL: 'fadd_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fadd <2 x float>
-define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
 %add = fadd <2 x float> %vec, %b
 store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)
 ; ALL: 'fadd_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
-define void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
 %add = fadd <3 x float> %vec, %b
 store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@ define void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)
 ; ALL: 'fadd_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fadd double
 ; SLOWF64: estimated cost of 3 for {{.*}} fadd double
-define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
 %vec = load double, double addrspace(1)* %vaddr
 %add = fadd double %vec, %b
 store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@ define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, do
 ; ALL: 'fadd_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fadd <2 x double>
-define void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
 %add = fadd <2 x double> %vec, %b
 store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(
 ; ALL: 'fadd_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fadd <3 x double>
-define void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
 %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
 %add = fadd <3 x double> %vec, %b
 store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -60,7 +60,7 @@ define void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(
 ; ALL 'fadd_f16'
 ; ALL estimated cost of 1 for {{.*}} fadd half
-define void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
 %vec = load half, half addrspace(1)* %vaddr
 %add = fadd half %vec, %b
 store half %add, half addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %
 ; ALL 'fadd_v2f16'
 ; ALL estimated cost of 2 for {{.*}} fadd <2 x half>
-define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
 %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
 %add = fadd <2 x half> %vec, %b
 store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)*
 ; ALL 'fadd_v4f16'
 ; ALL estimated cost of 4 for {{.*}} fadd <4 x half>
-define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
 %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
 %add = fadd <4 x half> %vec, %b
 store <4 x half> %add, <4 x half> addrspace(1)* %out

View File

@@ -5,7 +5,7 @@
 ; CHECK: 'fdiv_f32'
 ; ALL: estimated cost of 10 for {{.*}} fdiv float
-define void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
 %vec = load float, float addrspace(1)* %vaddr
 %add = fdiv float %vec, %b
 store float %add, float addrspace(1)* %out
@@ -14,7 +14,7 @@ define void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, floa
 ; ALL: 'fdiv_v2f32'
 ; ALL: estimated cost of 20 for {{.*}} fdiv <2 x float>
-define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
 %add = fdiv <2 x float> %vec, %b
 store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -23,7 +23,7 @@ define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)
 ; ALL: 'fdiv_v3f32'
 ; ALL: estimated cost of 30 for {{.*}} fdiv <3 x float>
-define void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
 %add = fdiv <3 x float> %vec, %b
 store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -35,7 +35,7 @@ define void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)
 ; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
 ; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
 ; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
-define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
 %vec = load double, double addrspace(1)* %vaddr
 %add = fdiv double %vec, %b
 store double %add, double addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, do
 ; CISLOWF64: estimated cost of 66 for {{.*}} fdiv <2 x double>
 ; SIFASTF64: estimated cost of 64 for {{.*}} fdiv <2 x double>
 ; SISLOWF64: estimated cost of 72 for {{.*}} fdiv <2 x double>
-define void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
 %add = fdiv <2 x double> %vec, %b
 store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -59,7 +59,7 @@ define void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(
 ; CISLOWF64: estimated cost of 99 for {{.*}} fdiv <3 x double>
 ; SIFASTF64: estimated cost of 96 for {{.*}} fdiv <3 x double>
 ; SISLOWF64: estimated cost of 108 for {{.*}} fdiv <3 x double>
-define void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
 %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
 %add = fdiv <3 x double> %vec, %b
 store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -68,7 +68,7 @@ define void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(
 ; ALL: 'fdiv_f16'
 ; ALL: estimated cost of 10 for {{.*}} fdiv half
-define void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
 %vec = load half, half addrspace(1)* %vaddr
 %add = fdiv half %vec, %b
 store half %add, half addrspace(1)* %out
@@ -77,7 +77,7 @@ define void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %
 ; ALL: 'fdiv_v2f16'
 ; ALL: estimated cost of 20 for {{.*}} fdiv <2 x half>
-define void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
 %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
 %add = fdiv <2 x half> %vec, %b
 store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -86,7 +86,7 @@ define void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)*
 ; ALL: 'fdiv_v4f16'
 ; ALL: estimated cost of 40 for {{.*}} fdiv <4 x half>
-define void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
 %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
 %add = fdiv <4 x half> %vec, %b
 store <4 x half> %add, <4 x half> addrspace(1)* %out

View File

@@ -3,7 +3,7 @@
 ; ALL: 'fmul_f32'
 ; ALL: estimated cost of 1 for {{.*}} fmul float
-define void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
 %vec = load float, float addrspace(1)* %vaddr
 %add = fmul float %vec, %b
 store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, floa
 ; ALL: 'fmul_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fmul <2 x float>
-define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
 %add = fmul <2 x float> %vec, %b
 store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)
 ; ALL: 'fmul_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fmul <3 x float>
-define void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
 %add = fmul <3 x float> %vec, %b
 store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@ define void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)
 ; ALL: 'fmul_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fmul double
 ; SLOWF64: estimated cost of 3 for {{.*}} fmul double
-define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
 %vec = load double, double addrspace(1)* %vaddr
 %add = fmul double %vec, %b
 store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@ define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, do
 ; ALL: 'fmul_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fmul <2 x double>
-define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
 %add = fmul <2 x double> %vec, %b
 store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(
 ; ALL: 'fmul_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fmul <3 x double>
-define void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
 %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
 %add = fmul <3 x double> %vec, %b
 store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -60,7 +60,7 @@ define void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(
 ; ALL 'fmul_f16'
 ; ALL estimated cost of 1 for {{.*}} fmul half
-define void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
 %vec = load half, half addrspace(1)* %vaddr
 %add = fmul half %vec, %b
 store half %add, half addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %
 ; ALL 'fmul_v2f16'
 ; ALL estimated cost of 2 for {{.*}} fmul <2 x half>
-define void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
 %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
 %add = fmul <2 x half> %vec, %b
 store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)*
 ; ALL 'fmul_v4f16'
 ; ALL estimated cost of 4 for {{.*}} fmul <4 x half>
-define void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
 %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
 %add = fmul <4 x half> %vec, %b
 store <4 x half> %add, <4 x half> addrspace(1)* %out

View File

@@ -3,7 +3,7 @@
 ; ALL: 'fsub_f32'
 ; ALL: estimated cost of 1 for {{.*}} fsub float
-define void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
 %vec = load float, float addrspace(1)* %vaddr
 %add = fsub float %vec, %b
 store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, floa
 ; ALL: 'fsub_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fsub <2 x float>
-define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
 %add = fsub <2 x float> %vec, %b
 store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)
 ; ALL: 'fsub_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fsub <3 x float>
-define void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
 %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
 %add = fsub <3 x float> %vec, %b
 store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@ define void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)
 ; ALL: 'fsub_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fsub double
 ; SLOWF64: estimated cost of 3 for {{.*}} fsub double
-define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
 %vec = load double, double addrspace(1)* %vaddr
 %add = fsub double %vec, %b
 store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@ define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, do
 ; ALL: 'fsub_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fsub <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fsub <2 x double>
-define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
 %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
 %add = fsub <2 x double> %vec, %b
 store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(
 ; ALL: 'fsub_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fsub <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fsub <3 x double>
-define void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
 %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
 %add = fsub <3 x double> %vec, %b
 store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -60,7 +60,7 @@ define void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(
 ; ALL: 'fsub_f16'
 ; ALL: estimated cost of 1 for {{.*}} fsub half
-define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
 %vec = load half, half addrspace(1)* %vaddr
 %add = fsub half %vec, %b
 store half %add, half addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %
 ; ALL: 'fsub_v2f16'
 ; ALL: estimated cost of 2 for {{.*}} fsub <2 x half>
-define void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
 %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
 %add = fsub <2 x half> %vec, %b
 store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)*
 ; ALL: 'fsub_v4f16'
 ; ALL: estimated cost of 4 for {{.*}} fsub <4 x half>
-define void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
 %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
 %add = fsub <4 x half> %vec, %b
 store <4 x half> %add, <4 x half> addrspace(1)* %out

View File

@@ -2,7 +2,7 @@
 ; CHECK: 'insertelement_v2i32'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i32>
-define void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
 %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
 %insert = insertelement <2 x i32> %vec, i32 1, i32 123
 store <2 x i32> %insert, <2 x i32> addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspa
 ; CHECK: 'insertelement_v2i64'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i64>
-define void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
 %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
 %insert = insertelement <2 x i64> %vec, i64 1, i64 123
 store <2 x i64> %insert, <2 x i64> addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspa
 ; CHECK: 'insertelement_v2i16'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i16>
-define void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
 %insert = insertelement <2 x i16> %vec, i16 1, i16 123
 store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspa
 ; CHECK: 'insertelement_v2i8'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i8>
-define void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
 %vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr
 %insert = insertelement <2 x i8> %vec, i8 1, i8 123
 store <2 x i8> %insert, <2 x i8> addrspace(1)* %out

View File

@@ -2,7 +2,7 @@
 ; CHECK: 'mul_i32'
 ; CHECK: estimated cost of 3 for {{.*}} mul i32
-define void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %mul = mul i32 %vec, %b
 store i32 %mul, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #
 ; CHECK: 'mul_v2i32'
 ; CHECK: estimated cost of 6 for {{.*}} mul <2 x i32>
-define void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
+define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
 %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
 %mul = mul <2 x i32> %vec, %b
 store <2 x i32> %mul, <2 x i32> addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %va
 ; CHECK: 'mul_v3i32'
 ; CHECK: estimated cost of 9 for {{.*}} mul <3 x i32>
-define void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
+define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
 %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
 %mul = mul <3 x i32> %vec, %b
 store <3 x i32> %mul, <3 x i32> addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %va
 ; CHECK: 'mul_v4i32'
 ; CHECK: estimated cost of 12 for {{.*}} mul <4 x i32>
-define void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
+define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
 %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
 %mul = mul <4 x i32> %vec, %b
 store <4 x i32> %mul, <4 x i32> addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %va
 ; CHECK: 'mul_i64'
 ; CHECK: estimated cost of 16 for {{.*}} mul i64
-define void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %mul = mul i64 %vec, %b
 store i64 %mul, i64 addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #
 ; CHECK: 'mul_v2i64'
 ; CHECK: estimated cost of 32 for {{.*}} mul <2 x i64>
-define void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
 %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
 %mul = mul <2 x i64> %vec, %b
 store <2 x i64> %mul, <2 x i64> addrspace(1)* %out
@@ -56,7 +56,7 @@ define void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %va
 ; CHECK: 'mul_v3i64'
 ; CHECK: estimated cost of 48 for {{.*}} mul <3 x i64>
-define void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
 %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
 %mul = mul <3 x i64> %vec, %b
 store <3 x i64> %mul, <3 x i64> addrspace(1)* %out
@@ -65,7 +65,7 @@ define void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %va
 ; CHECK: 'mul_v4i64'
 ; CHECK: estimated cost of 64 for {{.*}} mul <4 x i64>
-define void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
 %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
 %mul = mul <4 x i64> %vec, %b
 store <4 x i64> %mul, <4 x i64> addrspace(1)* %out
@@ -75,7 +75,7 @@ define void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %va
 ; CHECK: 'mul_v8i64'
 ; CHECK: estimated cost of 128 for {{.*}} mul <8 x i64>
-define void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
 %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
 %mul = mul <8 x i64> %vec, %b
 store <8 x i64> %mul, <8 x i64> addrspace(1)* %out

View File

@@ -3,7 +3,7 @@
 ; ALL: 'shl_i32'
 ; ALL: estimated cost of 1 for {{.*}} shl i32
-define void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %or = shl i32 %vec, %b
 store i32 %or, i32 addrspace(1)* %out
@@ -13,7 +13,7 @@ define void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #
 ; ALL: 'shl_i64'
 ; FAST64: estimated cost of 2 for {{.*}} shl i64
 ; SLOW64: estimated cost of 3 for {{.*}} shl i64
-define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %or = shl i64 %vec, %b
 store i64 %or, i64 addrspace(1)* %out
@@ -22,7 +22,7 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #
 ; ALL: 'lshr_i32'
 ; ALL: estimated cost of 1 for {{.*}} lshr i32
-define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %or = lshr i32 %vec, %b
 store i32 %or, i32 addrspace(1)* %out
@@ -32,7 +32,7 @@ define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b)
 ; ALL: 'lshr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} lshr i64
 ; SLOW64: estimated cost of 3 for {{.*}} lshr i64
-define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %or = lshr i64 %vec, %b
 store i64 %or, i64 addrspace(1)* %out
@@ -41,7 +41,7 @@ define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b)
 ; ALL: 'ashr_i32'
 ; ALL: estimated cost of 1 for {{.*}} ashr i32
-define void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 %vec = load i32, i32 addrspace(1)* %vaddr
 %or = ashr i32 %vec, %b
 store i32 %or, i32 addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b)
 ; ALL: 'ashr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} ashr i64
 ; SLOW64: estimated cost of 3 for {{.*}} ashr i64
-define void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
 %vec = load i64, i64 addrspace(1)* %vaddr
 %or = ashr i64 %vec, %b
 store i64 %or, i64 addrspace(1)* %out

View File

@@ -1,7 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s
 ; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
-define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
+define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
 %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
 store i32 %swizzle, i32 addrspace(1)* %out, align 4
 ret void

View File

@@ -5,7 +5,7 @@
 ; CHECK: DIVERGENT: %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4
 ; The post dominator tree does not have a root node in this case
-define void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
+define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
 bb0:
 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
 %tmp2 = sext i32 %tmp to i64

View File

@@ -1,7 +1,7 @@
 ; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s
 ; CHECK: DIVERGENT: %tmp = cmpxchg volatile
-define void @unreachable_loop(i32 %tidx) #0 {
+define amdgpu_kernel void @unreachable_loop(i32 %tidx) #0 {
 entry:
 unreachable

View File

@@ -7,35 +7,35 @@ declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
 ; CHECK: DIVERGENT: %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-define void @workitem_id_x() #1 {
+define amdgpu_kernel void @workitem_id_x() #1 {
 %id.x = call i32 @llvm.amdgcn.workitem.id.x()
 store volatile i32 %id.x, i32 addrspace(1)* undef
 ret void
 }
 ; CHECK: DIVERGENT: %id.y = call i32 @llvm.amdgcn.workitem.id.y()
-define void @workitem_id_y() #1 {
+define amdgpu_kernel void @workitem_id_y() #1 {
 %id.y = call i32 @llvm.amdgcn.workitem.id.y()
 store volatile i32 %id.y, i32 addrspace(1)* undef
 ret void
 }
 ; CHECK: DIVERGENT: %id.z = call i32 @llvm.amdgcn.workitem.id.z()
-define void @workitem_id_z() #1 {
+define amdgpu_kernel void @workitem_id_z() #1 {
 %id.z = call i32 @llvm.amdgcn.workitem.id.z()
 store volatile i32 %id.z, i32 addrspace(1)* undef
 ret void
 }
 ; CHECK: DIVERGENT: %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
-define void @mbcnt_lo() #1 {
+define amdgpu_kernel void @mbcnt_lo() #1 {
 %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
 store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef
 ret void
 }
 ; CHECK: DIVERGENT: %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
-define void @mbcnt_hi() #1 {
+define amdgpu_kernel void @mbcnt_hi() #1 {
 %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
 store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef
 ret void

View File

@@ -13,7 +13,7 @@
 ; FUNC-LABEL: {{^}}local_address_load:
 ; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
 ; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
-define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
 %0 = load i32, i32 addrspace(3)* %in
 store i32 %0, i32 addrspace(1)* %out
@@ -24,7 +24,7 @@ entry:
 ; SI: s_add_i32 [[SPTR:s[0-9]]]
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_read_b32 [[VPTR]]
-define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
+define amdgpu_kernel void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
 entry:
 %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset
 %1 = load i32, i32 addrspace(3)* %0
@@ -35,7 +35,7 @@ entry:
 ; FUNC-LABEL: {{^}}local_address_gep_const_offset:
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
-define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
 %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1
 %1 = load i32, i32 addrspace(3)* %0
@@ -48,7 +48,7 @@ entry:
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_read_b32 [[VPTR]]
-define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
 %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385
 %1 = load i32, i32 addrspace(3)* %0
@@ -60,7 +60,7 @@ entry:
 ; SI: v_cmp_ne_u32
 ; SI-NOT: v_cmp_ne_u32
 ; SI: v_cndmask_b32
-define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
+define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
 %cmp = icmp ne i32 addrspace(3)* %lds, null
 %x = select i1 %cmp, i32 123, i32 456
 store i32 %x, i32 addrspace(1)* %out
@@ -71,7 +71,7 @@ define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds)
 ; SI: s_mul_i32
 ; SI-NEXT: s_add_i32
 ; SI: ds_read_b32
-define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
+define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
 %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
 %val = load float, float addrspace(3)* %ptr
 store float %val, float addrspace(1)* %out
@@ -83,7 +83,7 @@ define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %
 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
-define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
+define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
 %val = load float, float addrspace(3)* @g_lds
 store float %val, float addrspace(1)* %out
 ret void
@@ -95,14 +95,14 @@ define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %ti
 ; FUNC-LABEL: {{^}}global_ptr:
 ; SI: ds_write_b32
-define void @global_ptr() nounwind {
+define amdgpu_kernel void @global_ptr() nounwind {
 store i32 addrspace(3)* getelementptr ([16383 x i32], [16383 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
 ret void
 }
 ; FUNC-LABEL: {{^}}local_address_store:
 ; SI: ds_write_b32
-define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
 store i32 %val, i32 addrspace(3)* %out
 ret void
 }
@@ -111,7 +111,7 @@ define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
 ; SI: s_add_i32 [[SADDR:s[0-9]+]],
 ; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
 ; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
-define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
+define amdgpu_kernel void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset
 store i32 %val, i32 addrspace(3)* %gep, align 4
 ret void
@@ -121,7 +121,7 @@ define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
 ; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
-define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1
 store i32 %val, i32 addrspace(3)* %gep, align 4
 ret void
@@ -132,7 +132,7 @@ define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %v
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
-define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
 %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385
 store i32 %val, i32 addrspace(3)* %gep, align 4
 ret void

@ -4,7 +4,7 @@
# REQUIRES: global-isel
--- |
define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
...
---

@ -5,7 +5,7 @@
# REQUIRES: global-isel
--- |
define void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
define amdgpu_kernel void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
...
---

@ -4,7 +4,7 @@
# REQUIRES: global-isel
--- |
define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
...
---

@ -3,12 +3,12 @@
# REQUIRES: global-isel
--- |
define void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
define void @load_global_uniform(i32 addrspace(1)* %ptr1) {
define amdgpu_kernel void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) {
%tmp0 = load i32, i32 addrspace(1)* %ptr1
ret void
}
define void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
define amdgpu_kernel void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
%tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
%tmp1 = getelementptr i32, i32 addrspace(1)* %ptr2, i32 %tmp0
%tmp2 = load i32, i32 addrspace(1)* %tmp1

@ -9,7 +9,7 @@
; GCN-LABEL: {{^}}smrd0:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
entry:
%0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
%1 = load i32, i32 addrspace(2)* %0
@ -21,7 +21,7 @@ entry:
; GCN-LABEL: {{^}}smrd1:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
entry:
%0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
%1 = load i32, i32 addrspace(2)* %0
@ -36,7 +36,7 @@ entry:
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
; GCN: s_endpgm
define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
entry:
%0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
%1 = load i32, i32 addrspace(2)* %0
@ -51,7 +51,7 @@ entry:
; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
; TODO: Add VI checks
; XGCN: s_endpgm
define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
entry:
%0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
%1 = load i32, i32 addrspace(2)* %0
@ -65,7 +65,7 @@ entry:
; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
entry:
%0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
%1 = load i32, i32 addrspace(2)* %0
@ -79,7 +79,7 @@ entry:
; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; GCN: s_endpgm
define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
entry:
%0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
%1 = load i32, i32 addrspace(2)* %0

@ -3,7 +3,7 @@
; REQUIRES: asserts
; Check that SelectionDAGDumper does not crash on int_SI_if.
define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
entry:
%0 = icmp eq i64 %a, 0
br i1 %0, label %if, label %else

@ -6,7 +6,7 @@
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@ -23,7 +23,7 @@ define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
define amdgpu_kernel void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@ -38,7 +38,7 @@ define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
define amdgpu_kernel void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@ -53,7 +53,7 @@ define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]]
; VI-NEXT: buffer_store_short [[ADD]]
define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@ -69,7 +69,7 @@ define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: buffer_store_dword [[ADD]]
define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@ -89,7 +89,7 @@ define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@ -109,7 +109,7 @@ define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: buffer_store_dword [[SEXT]]
define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@ -130,7 +130,7 @@ define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid

@ -8,7 +8,7 @@
;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}}
;SI-NOT: [[REG]]
;SI: buffer_store_dword [[REG]],
define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%b = load i32, i32 addrspace(1)* %b_ptr
@ -24,7 +24,7 @@ define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1)* %in
%b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@ -44,7 +44,7 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1)* %in
%b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@ -71,7 +71,7 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; SI: s_add_i32
; SI: s_add_i32
; SI: s_add_i32
define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
define amdgpu_kernel void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
entry:
%0 = add <8 x i32> %a, %b
store <8 x i32> %0, <8 x i32> addrspace(1)* %out
@ -112,7 +112,7 @@ entry:
; SI: s_add_i32
; SI: s_add_i32
; SI: s_add_i32
define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
define amdgpu_kernel void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
entry:
%0 = add <16 x i32> %a, %b
store <16 x i32> %0, <16 x i32> addrspace(1)* %out
@ -129,7 +129,7 @@ entry:
; EG-DAG: ADD_INT
; EG-DAG: ADD_INT {{[* ]*}}
; EG-NOT: SUB
define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
entry:
%0 = add i64 %a, %b
store i64 %0, i64 addrspace(1)* %out
@ -150,7 +150,7 @@ entry:
; EG-DAG: ADD_INT
; EG-DAG: ADD_INT {{[* ]*}}
; EG-NOT: SUB
define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
entry:
%0 = load i64, i64 addrspace(1)* %in
%1 = add i64 %a, %0
@ -169,7 +169,7 @@ entry:
; EG-DAG: ADD_INT
; EG-DAG: ADD_INT {{[* ]*}}
; EG-NOT: SUB
define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
entry:
%0 = icmp eq i64 %a, 0
br i1 %0, label %if, label %else

@ -7,7 +7,7 @@
; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@ -27,7 +27,7 @@ define void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(
; VI: s_add_i32
; VI: s_add_i32
define void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
%a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
%b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
%add = add <2 x i16> %a, %b
@ -41,7 +41,7 @@ define void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(
; VI: s_add_i32
; VI: s_add_i32
define void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
%a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
%add = add <2 x i16> %a, %a
store <2 x i16> %add, <2 x i16> addrspace(1)* %out
@ -54,7 +54,7 @@ define void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrs
; VI: v_add_i32
; VI: v_add_i32
define void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
%add = add <2 x i16> %a, %b
store <2 x i16> %add, <2 x i16> addrspace(1)* %out
ret void
@ -66,7 +66,7 @@ define void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}}
define void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@ -83,7 +83,7 @@ define void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> a
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}}
define void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@ -102,7 +102,7 @@ define void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i1
; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; VI: v_or_b32_e32
define void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@ -121,7 +121,7 @@ define void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16
; VI-NOT: v_add_u16
; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; VI: v_or_b32_e32
define void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@ -141,7 +141,7 @@ define void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2
; VI-NOT: v_add_u16
; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
; VI: v_or_b32_e32
define void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@ -173,7 +173,7 @@ define void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x
; VI-NOT: and
; VI-NOT: shl
; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
define void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@ -208,7 +208,7 @@ define void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
; VI: v_add_u16_e32
; VI: buffer_store_dwordx4
define void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@ -236,7 +236,7 @@ define void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; VI: buffer_store_dwordx2
define void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
@ -264,7 +264,7 @@ define void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
define void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
%gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid

@ -6,7 +6,7 @@
; GCN-NEXT: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
; GCN-NEXT: v_addc_u32_e32 v[[HI:[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]],
define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) {
define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr i128, i128 addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr i128, i128 addrspace(1)* %inB, i32 %tid
@ -23,7 +23,7 @@ define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)*
; GCN: v_addc_u32
; GCN: v_addc_u32
; GCN: v_addc_u32
define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
%foo = load i128, i128 addrspace(1)* %in, align 8
%result = add i128 %foo, %a
store i128 %result, i128 addrspace(1)* %out
@ -35,7 +35,7 @@ define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* no
; GCN: v_addc_u32
; GCN: v_addc_u32
; GCN: v_addc_u32
define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
%foo = load i128, i128 addrspace(1)* %in, align 8
%result = add i128 %a, %foo
store i128 %result, i128 addrspace(1)* %out
@ -47,7 +47,7 @@ define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspa
; GCN: s_addc_u32
; GCN: s_addc_u32
; GCN: s_addc_u32
define void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) {
define amdgpu_kernel void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) {
%result = add i128 %a, %b
store i128 %result, i128 addrspace(1)* %out
ret void

@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone
; SI-LABEL: {{^}}test_i64_vreg:
; SI: v_add_i32
; SI: v_addc_u32
define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
@ -21,7 +21,7 @@ define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noa
; SI-LABEL: {{^}}sgpr_operand:
; SI: v_add_i32
; SI: v_addc_u32
define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
%foo = load i64, i64 addrspace(1)* %in, align 8
%result = add i64 %foo, %a
store i64 %result, i64 addrspace(1)* %out
@ -34,7 +34,7 @@ define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noal
; SI-LABEL: {{^}}sgpr_operand_reversed:
; SI: v_add_i32
; SI: v_addc_u32
define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
%foo = load i64, i64 addrspace(1)* %in, align 8
%result = add i64 %a, %foo
store i64 %result, i64 addrspace(1)* %out
@ -47,7 +47,7 @@ define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace
; SI: s_addc_u32
; SI: s_add_u32
; SI: s_addc_u32
define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
define amdgpu_kernel void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
%result = add <2 x i64> %a, %b
store <2 x i64> %result, <2 x i64> addrspace(1)* %out
ret void
@ -58,7 +58,7 @@ define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a,
; SI: v_addc_u32
; SI: v_add_i32
; SI: v_addc_u32
define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
define amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
%a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
%b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
@ -76,7 +76,7 @@ define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> add
; SI-NOT: addc
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; SI: buffer_store_dword [[VRESULT]],
define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
%add = add i64 %b, %a
%trunc = trunc i64 %add to i32
store i32 %trunc, i32 addrspace(1)* %out, align 8

@ -9,7 +9,7 @@ declare void @consume_ptr2int(i32) #0
; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
; CHECK: store i32 %ptr2int, i32 addrspace(1)* %out
define void @addrspacecast_captured(i32 addrspace(1)* %out) #0 {
define amdgpu_kernel void @addrspacecast_captured(i32 addrspace(1)* %out) #0 {
entry:
%data = alloca i32, align 4
%cast = addrspacecast i32* %data to i32 addrspace(4)*
@ -22,7 +22,7 @@ entry:
; CHECK: %data = alloca i32, align 4
; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
; CHECK: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out
define void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 {
define amdgpu_kernel void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 {
entry:
%data = alloca i32, align 4
%cast = addrspacecast i32* %data to i32 addrspace(4)*
@ -35,7 +35,7 @@ entry:
; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
; CHECK: call void @consume_ptr2int(i32 %ptr2int)
define void @addrspacecast_captured_call() #0 {
define amdgpu_kernel void @addrspacecast_captured_call() #0 {
entry:
%data = alloca i32, align 4
%cast = addrspacecast i32* %data to i32 addrspace(4)*

@ -9,57 +9,57 @@ declare void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* nocapture, i32 addrs
@global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4
; HSA: @store_cast_0_flat_to_group_addrspacecast() #1
define void @store_cast_0_flat_to_group_addrspacecast() #1 {
define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 {
store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
ret void
}
; HSA: @store_cast_0_group_to_flat_addrspacecast() #2
define void @store_cast_0_group_to_flat_addrspacecast() #1 {
define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 {
store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*)
ret void
}
; HSA: define void @store_constant_cast_group_gv_to_flat() #2
define void @store_constant_cast_group_gv_to_flat() #1 {
; HSA: define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #2
define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 {
store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*)
ret void
}
; HSA: @store_constant_cast_group_gv_gep_to_flat() #2
define void @store_constant_cast_group_gv_gep_to_flat() #1 {
define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 {
store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
ret void
}
; HSA: @store_constant_cast_global_gv_to_flat() #1
define void @store_constant_cast_global_gv_to_flat() #1 {
define amdgpu_kernel void @store_constant_cast_global_gv_to_flat() #1 {
store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global.i32 to i32 addrspace(4)*)
ret void
}
; HSA: @store_constant_cast_global_gv_gep_to_flat() #1
define void @store_constant_cast_global_gv_gep_to_flat() #1 {
define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 {
store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(1)* @global.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
ret void
}
; HSA: @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
define void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
%val = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
store i32 %val, i32 addrspace(1)* %out
ret void
}
; HSA: @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
define void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
%val = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst
store i32 %val, i32 addrspace(1)* %out
ret void
}
; HSA: @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
%val = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst
%val0 = extractvalue { i32, i1 } %val, 0
store i32 %val0, i32 addrspace(1)* %out
@ -67,28 +67,28 @@ define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out)
}
; HSA: @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
define void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* %out, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i32 4, i1 false)
ret void
}
; Can't just search the pointer value
; HSA: @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #2
define void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* %out
ret void
}
; Can't just search pointer types
; HSA: @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #2
define void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* %out
ret void
}
; Cast group to flat, do GEP, cast back to group
; HSA: @store_constant_cast_group_gv_gep_to_flat_to_group() #2
define void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*)
ret void
}

@ -28,7 +28,7 @@
; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
%stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %stof
ret void
@ -58,7 +58,7 @@ define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
; CI: NumSgprs: {{[0-9][0-9]+}}
; GFX9: NumSgprs: {{[0-9]+}}
define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
%stof = addrspacecast i32* %ptr to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %stof
ret void
@ -73,7 +73,7 @@ define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
%stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %stof
ret void
@ -85,7 +85,7 @@ define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
%stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
%ld = load volatile i32, i32 addrspace(4)* %stof
ret void
@ -102,7 +102,7 @@ define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
store volatile i32 0, i32 addrspace(3)* %ftos
ret void
@ -119,7 +119,7 @@ define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
store volatile i32 0, i32* %ftos
ret void
@ -133,7 +133,7 @@ define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
store volatile i32 0, i32 addrspace(1)* %ftos
ret void
@ -144,7 +144,7 @@ define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
load volatile i32, i32 addrspace(2)* %ftos
ret void
@ -158,7 +158,7 @@ define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_0_group_to_flat_addrspacecast() #0 {
define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %cast
ret void
@ -168,7 +168,7 @@ define void @cast_0_group_to_flat_addrspacecast() #0 {
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define void @cast_0_flat_to_group_addrspacecast() #0 {
define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
store volatile i32 7, i32 addrspace(3)* %cast
ret void
@ -179,7 +179,7 @@ define void @cast_0_flat_to_group_addrspacecast() #0 {
; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_neg1_group_to_flat_addrspacecast() #0 {
define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %cast
ret void
@ -189,7 +189,7 @@ define void @cast_neg1_group_to_flat_addrspacecast() #0 {
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: ds_write_b32 [[PTR]], [[K]]
define void @cast_neg1_flat_to_group_addrspacecast() #0 {
define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
store volatile i32 7, i32 addrspace(3)* %cast
ret void
@ -204,7 +204,7 @@ define void @cast_neg1_flat_to_group_addrspacecast() #0 {
; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
define void @cast_0_private_to_flat_addrspacecast() #0 {
define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
%cast = addrspacecast i32* null to i32 addrspace(4)*
store volatile i32 7, i32 addrspace(4)* %cast
ret void
@ -214,7 +214,7 @@ define void @cast_0_private_to_flat_addrspacecast() #0 {
; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}}
; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
define void @cast_0_flat_to_private_addrspacecast() #0 {
define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
%cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
store volatile i32 7, i32* %cast
ret void
@ -226,7 +226,7 @@ define void @cast_0_flat_to_private_addrspacecast() #0 {
; HSA-LABEL: {{^}}branch_use_flat_i32:
; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
; HSA: s_endpgm
define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
entry:
%cmp = icmp ne i32 %c, 0
br i1 %cmp, label %local, label %global
@ -259,7 +259,7 @@ end:
; HSA: flat_store_dword
; HSA: s_barrier
; HSA: flat_load_dword
define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
%alloca = alloca i32, i32 9, align 4
%x = call i32 @llvm.amdgcn.workitem.id.x() #2
%pptr = getelementptr i32, i32* %alloca, i32 %x

@ -16,7 +16,7 @@ entry:
; FUNC-LABEL: {{^}}i8ptr_v16i8ptr:
; SI: s_endpgm
define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
%1 = load <16 x i8>, <16 x i8> addrspace(1)* %0
@ -24,7 +24,7 @@ entry:
ret void
}
define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
%load = load float, float addrspace(1)* %in, align 4
%fadd32 = fadd float %load, 1.0
%bc = bitcast float %fadd32 to <2 x i16>
@ -33,7 +33,7 @@ define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in)
ret void
}
define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
%load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
%add.v2i16 = add <2 x i16> %load, <i16 2, i16 2>
%bc = bitcast <2 x i16> %add.v2i16 to float
@ -42,7 +42,7 @@ define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in)
ret void
}
define void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
define amdgpu_kernel void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
%load = load float, float addrspace(1)* %in, align 4
%fadd32 = fadd float %load, 1.0
%bc = bitcast float %fadd32 to <2 x half>
@ -51,7 +51,7 @@ define void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in
ret void
}
define void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
%load = load <2 x half>, <2 x half> addrspace(1)* %in, align 4
%add.v2f16 = fadd <2 x half> %load, <half 2.0, half 2.0>
%bc = bitcast <2 x half> %add.v2f16 to float
@ -60,14 +60,14 @@ define void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in
ret void
}
define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
%bc = bitcast <4 x i8> %load to i32
store i32 %bc, i32 addrspace(1)* %out, align 4
ret void
}
define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%load = load i32, i32 addrspace(1)* %in, align 4
%bc = bitcast i32 %load to <4 x i8>
store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
@ -76,7 +76,7 @@ define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nou
; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64:
; SI: s_endpgm
define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
%add = add <2 x i32> %val, <i32 4, i32 9>
%bc = bitcast <2 x i32> %add to double
@ -87,7 +87,7 @@ define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace
; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
; SI: s_endpgm
define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
%val = load double, double addrspace(1)* %in, align 8
%add = fadd double %val, 4.0
%bc = bitcast double %add to <2 x i32>
@ -96,7 +96,7 @@ define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace
}
; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64:
define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end
@ -112,7 +112,7 @@ end:
}
; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64:
define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
entry:
%cmp0 = icmp eq i32 %cond, 0
br i1 %cmp0, label %if, label %end

@ -15,7 +15,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
; GCN-NOT: v0
; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]]
define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
entry:
%0 = alloca [2 x i32]
%1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0

@ -4,7 +4,7 @@
; NOOP-LABEL: @noop_fdiv_fpmath(
; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
%md.25ulp = fdiv float %a, %b, !fpmath !0
store volatile float %md.25ulp, float addrspace(1)* %out
ret void
@ -18,7 +18,7 @@ define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
; CHECK: %arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
%no.md = fdiv float %a, %b
store volatile float %no.md, float addrspace(1)* %out
@ -51,7 +51,7 @@ define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
%no.md = fdiv float 1.0, %x
store volatile float %no.md, float addrspace(1)* %out
@ -89,7 +89,7 @@ define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
%no.md = fdiv <2 x float> %a, %b
store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
@ -120,7 +120,7 @@ define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a,
; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
%no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
@ -158,7 +158,7 @@ define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float>
; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp
define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
%no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
@ -186,7 +186,7 @@ define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2
; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
; CHECK: store volatile <2 x float> %fast.25ulp
define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
%x.insert = insertelement <2 x float> %x, float 1.0, i32 0
%arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
@ -206,7 +206,7 @@ define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %
; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
%no.md = fdiv float %a, %b
store volatile float %no.md, float addrspace(1)* %out

@ -80,7 +80,7 @@
; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0
; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0
; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0
define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4
@ -102,7 +102,7 @@ entry:
; OPT-LABEL: @high_alignment(
; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}}
define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
define amdgpu_kernel void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
%stack = alloca [8 x i32], align 16
%0 = load i32, i32 addrspace(1)* %in, align 4
@ -127,7 +127,7 @@ entry:
; OPT: alloca [5 x i32]
; SI-NOT: ds_write
define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
define amdgpu_kernel void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4
%0 = load i32, i32 addrspace(1)* %in, align 4
@ -162,7 +162,7 @@ entry:
; SI-NOT: v_movrel
%struct.point = type { i32, i32 }
define void @multiple_structs(i32 addrspace(1)* %out) #0 {
define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
entry:
%a = alloca %struct.point
%b = alloca %struct.point
@ -191,7 +191,7 @@ entry:
; R600-NOT: MOVA_INT
; SI-NOT: v_movrel
define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
entry:
%prv_array_const = alloca [2 x i32]
%prv_array = alloca [2 x i32]
@ -235,7 +235,7 @@ for.end:
; SI-PROMOTE: s_load_dword [[IDX:s[0-9]+]]
; SI-PROMOTE: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16
; SI-PROMOTE: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[SCALED_IDX]], 16
define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%0 = alloca [2 x i16]
%1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
@ -258,7 +258,7 @@ entry:
; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0
; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0
define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%0 = alloca [2 x i8]
%1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
@ -281,7 +281,7 @@ entry:
;
; A total of 5 bytes should be allocated and used.
; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ;
define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
%0 = alloca [3 x i8], align 1
%1 = alloca [2 x i8], align 1
@ -305,7 +305,7 @@ entry:
ret void
}
define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i8]]
%gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
@ -319,7 +319,7 @@ entry:
ret void
}
define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i32]]
%gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@ -332,7 +332,7 @@ entry:
ret void
}
define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x i64]]
%gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
@ -347,7 +347,7 @@ entry:
%struct.pair32 = type { i32, i32 }
define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x [2 x %struct.pair32]]
%gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
@ -360,7 +360,7 @@ entry:
ret void
}
define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
entry:
%alloca = alloca [2 x %struct.pair32]
%gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
@ -373,7 +373,7 @@ entry:
ret void
}
define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
entry:
%tmp = alloca [2 x i32]
%tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
@ -394,7 +394,7 @@ entry:
; SI-NOT: ds_write
; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ;
define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%alloca = alloca [16 x i32]
%tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
store i32 5, i32* %tmp0
@ -410,7 +410,7 @@ define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
; OPT-LABEL: @pointer_typed_alloca(
; OPT: getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}}
; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4
define void @pointer_typed_alloca(i32 addrspace(1)* %A) {
define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) {
entry:
%A.addr = alloca i32 addrspace(1)*, align 4
store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
@ -462,7 +462,7 @@ entry:
; SI: buffer_load_dword
; SI: buffer_load_dword
define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
define amdgpu_kernel void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
%alloca = alloca [2 x <16 x i32>]
%tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a
%tmp5 = load <16 x i32>, <16 x i32>* %tmp0
@ -506,7 +506,7 @@ define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
; SI: buffer_load_dword
; SI: buffer_load_dword
define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
define amdgpu_kernel void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
%alloca = alloca [2 x <16 x float>]
%tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a
%tmp5 = load <16 x float>, <16 x float>* %tmp0
@ -522,7 +522,7 @@ define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
; SI: buffer_load_dword
; SI: buffer_load_dword
define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
define amdgpu_kernel void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
%alloca = alloca [16 x <2 x float>]
%tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a
%tmp5 = load <2 x float>, <2 x float>* %tmp0
@ -533,7 +533,7 @@ define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
; OPT-LABEL: @direct_alloca_read_0xi32(
; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)*
; OPT: load [0 x i32], [0 x i32] addrspace(3)*
define void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
define amdgpu_kernel void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
entry:
%tmp = alloca [0 x i32]
store [0 x i32] [], [0 x i32]* %tmp
@ -545,7 +545,7 @@ entry:
; OPT-LABEL: @direct_alloca_read_1xi32(
; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)*
; OPT: load [1 x i32], [1 x i32] addrspace(3)*
define void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
define amdgpu_kernel void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
entry:
%tmp = alloca [1 x i32]
store [1 x i32] [i32 0], [1 x i32]* %tmp

@ -12,7 +12,7 @@
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
define void @ngroups_x (i32 addrspace(1)* %out) {
define amdgpu_kernel void @ngroups_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.x() #0
store i32 %0, i32 addrspace(1)* %out
@ -27,7 +27,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
define void @ngroups_y (i32 addrspace(1)* %out) {
define amdgpu_kernel void @ngroups_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.y() #0
store i32 %0, i32 addrspace(1)* %out
@ -42,7 +42,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
define void @ngroups_z (i32 addrspace(1)* %out) {
define amdgpu_kernel void @ngroups_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.ngroups.z() #0
store i32 %0, i32 addrspace(1)* %out
@ -57,7 +57,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
define void @global_size_x (i32 addrspace(1)* %out) {
define amdgpu_kernel void @global_size_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.x() #0
store i32 %0, i32 addrspace(1)* %out
@ -72,7 +72,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
define void @global_size_y (i32 addrspace(1)* %out) {
define amdgpu_kernel void @global_size_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.y() #0
store i32 %0, i32 addrspace(1)* %out
@ -87,7 +87,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
define void @global_size_z (i32 addrspace(1)* %out) {
define amdgpu_kernel void @global_size_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.global.size.z() #0
store i32 %0, i32 addrspace(1)* %out
@ -102,7 +102,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[1].Z
define void @local_size_x (i32 addrspace(1)* %out) {
define amdgpu_kernel void @local_size_x (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.x() #0
store i32 %0, i32 addrspace(1)* %out
@ -117,7 +117,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[1].W
define void @local_size_y (i32 addrspace(1)* %out) {
define amdgpu_kernel void @local_size_y (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.y() #0
store i32 %0, i32 addrspace(1)* %out
@ -132,7 +132,7 @@ entry:
; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
; EG: MOV {{\*? *}}[[VAL]], KC0[2].X
define void @local_size_z (i32 addrspace(1)* %out) {
define amdgpu_kernel void @local_size_z (i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.local.size.z() #0
store i32 %0, i32 addrspace(1)* %out
@ -153,7 +153,7 @@ entry:
; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
define void @tgid_x_legacy(i32 addrspace(1)* %out) {
define amdgpu_kernel void @tgid_x_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.x() #0
store i32 %0, i32 addrspace(1)* %out
@ -165,7 +165,7 @@ entry:
; GCN-NOHSA: buffer_store_dword [[VVAL]]
; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
define void @tgid_y_legacy(i32 addrspace(1)* %out) {
define amdgpu_kernel void @tgid_y_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.y() #0
store i32 %0, i32 addrspace(1)* %out
@ -181,7 +181,7 @@ entry:
; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
define void @tgid_z_legacy(i32 addrspace(1)* %out) {
define amdgpu_kernel void @tgid_z_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tgid.z() #0
store i32 %0, i32 addrspace(1)* %out
@ -194,7 +194,7 @@ entry:
; FUNC-LABEL: {{^}}tidig_x_legacy:
; GCN-NOHSA: buffer_store_dword v0
define void @tidig_x_legacy(i32 addrspace(1)* %out) {
define amdgpu_kernel void @tidig_x_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.x() #0
store i32 %0, i32 addrspace(1)* %out
@ -208,7 +208,7 @@ entry:
; FUNC-LABEL: {{^}}tidig_y_legacy:
; GCN-NOHSA: buffer_store_dword v1
define void @tidig_y_legacy(i32 addrspace(1)* %out) {
define amdgpu_kernel void @tidig_y_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.y() #0
store i32 %0, i32 addrspace(1)* %out
@ -221,7 +221,7 @@ entry:
; FUNC-LABEL: {{^}}tidig_z_legacy:
; GCN-NOHSA: buffer_store_dword v2
define void @tidig_z_legacy(i32 addrspace(1)* %out) {
define amdgpu_kernel void @tidig_z_legacy(i32 addrspace(1)* %out) {
entry:
%0 = call i32 @llvm.r600.read.tidig.z() #0
store i32 %0, i32 addrspace(1)* %out

@ -4,7 +4,7 @@
; FUNC-LABEL: {{^}}v_and_i64_br:
; SI: v_and_b32
; SI: v_and_b32
define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
entry:
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%tmp0 = icmp eq i32 %tid, 0

@ -11,7 +11,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
%a = load <2 x i32>, <2 x i32> addrspace(1) * %in
%b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@ -31,7 +31,7 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
%b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
%a = load <4 x i32>, <4 x i32> addrspace(1) * %in
%b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@ -42,7 +42,7 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
; FUNC-LABEL: {{^}}s_and_i32:
; SI: s_and_b32
define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
define amdgpu_kernel void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
%and = and i32 %a, %b
store i32 %and, i32 addrspace(1)* %out, align 4
ret void
@ -50,7 +50,7 @@ define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
; FUNC-LABEL: {{^}}s_and_constant_i32:
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
%and = and i32 %a, 1234567
store i32 %and, i32 addrspace(1)* %out, align 4
ret void
@ -66,7 +66,7 @@ define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
; SI: buffer_store_dword [[VK]]
define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
%and = and i32 %a, 1234567
; Just to stop future replacement of copy to vgpr + store with VALU op.
@ -83,7 +83,7 @@ define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32
; SI: s_add_i32
; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]]
; SI: buffer_store_dword [[VK]]
define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
%and = and i32 %a, 1234567
%foo = add i32 %and, 1234567
%bar = add i32 %foo, %b
@ -93,7 +93,7 @@ define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32
; FUNC-LABEL: {{^}}v_and_i32_vgpr_vgpr:
; SI: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
define amdgpu_kernel void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@ -109,7 +109,7 @@ define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; SI-DAG: s_load_dword [[SA:s[0-9]+]]
; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) {
define amdgpu_kernel void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
%gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -123,7 +123,7 @@ define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1
; SI-DAG: s_load_dword [[SA:s[0-9]+]]
; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) {
define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -135,7 +135,7 @@ define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
; FUNC-LABEL: {{^}}v_and_constant_i32
; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%and = and i32 %a, 1234567
store i32 %and, i32 addrspace(1)* %out, align 4
@ -144,7 +144,7 @@ define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr)
; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%and = and i32 %a, 64
store i32 %and, i32 addrspace(1)* %out, align 4
@ -153,7 +153,7 @@ define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
%a = load i32, i32 addrspace(1)* %aptr, align 4
%and = and i32 %a, -16
store i32 %and, i32 addrspace(1)* %out, align 4
@ -162,7 +162,7 @@ define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1
; FUNC-LABEL: {{^}}s_and_i64
; SI: s_and_b64
define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
define amdgpu_kernel void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
%and = and i64 %a, %b
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -171,7 +171,7 @@ define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
; FIXME: Should use SGPRs
; FUNC-LABEL: {{^}}s_and_i1:
; SI: v_and_b32
define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
%and = and i1 %a, %b
store i1 %and, i1 addrspace(1)* %out
ret void
@ -181,7 +181,7 @@ define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000{{$}}
; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80{{$}}
; SI: buffer_store_dwordx2
define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
define amdgpu_kernel void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
%and = and i64 %a, 549756338176
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -191,7 +191,7 @@ define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
; XSI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x80000{{$}}
; XSI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0x80{{$}}
; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}
define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
define amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
%and0 = and i64 %a, 549756338176
%and1 = and i64 %b, 549756338176
store volatile i64 %and0, i64 addrspace(1)* %out
@ -205,7 +205,7 @@ define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
%and = and i64 %a, 1234567
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -223,7 +223,7 @@ define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
%shl.a = shl i64 %a, 1
%shl.b = shl i64 %b, 1
%and0 = and i64 %shl.a, 62
@ -238,7 +238,7 @@ define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64
; FUNC-LABEL: {{^}}v_and_i64:
; SI: v_and_b32
; SI: v_and_b32
define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%b = load i64, i64 addrspace(1)* %bptr, align 8
%and = and i64 %a, %b
@ -250,7 +250,7 @@ define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addr
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, {{v[0-9]+}}
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
; SI: buffer_store_dwordx2
define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, 1231231234567
store i64 %and, i64 addrspace(1)* %out, align 8
@ -268,7 +268,7 @@ define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr)
; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]]
; SI: buffer_store_dwordx2
; SI: buffer_store_dwordx2
define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load volatile i64, i64 addrspace(1)* %aptr
%b = load volatile i64, i64 addrspace(1)* %aptr
%and0 = and i64 %a, 1231231234567
@ -288,7 +288,7 @@ define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI-NOT: and
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load volatile i64, i64 addrspace(1)* %aptr
%b = load volatile i64, i64 addrspace(1)* %aptr
%and0 = and i64 %a, 63
@ -304,7 +304,7 @@ define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspac
; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, 1234567
store i64 %and, i64 addrspace(1)* %out, align 8
@ -317,7 +317,7 @@ define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, 64
store i64 %and, i64 addrspace(1)* %out, align 8
@ -331,7 +331,7 @@ define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %apt
; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]]
; SI-NOT: and
; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
%a = load i64, i64 addrspace(1)* %aptr, align 8
%and = and i64 %a, -8
store i64 %and, i64 addrspace(1)* %out, align 8
@ -344,7 +344,7 @@ define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 64
; SI-NOT: and
; SI: buffer_store_dword
define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 64
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -358,7 +358,7 @@ define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
; SI-NOT: and
; SI: s_add_u32
; SI-NEXT: s_addc_u32
define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
%shl = shl i64 %a, 1
%and = and i64 %shl, 64
%add = add i64 %and, %b
@ -372,7 +372,7 @@ define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrsp
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 1
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -387,7 +387,7 @@ define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4607182418800017408
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -402,7 +402,7 @@ define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13830554455654793216
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -417,7 +417,7 @@ define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4602678819172646912
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -432,7 +432,7 @@ define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13826050856027422720
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -445,7 +445,7 @@ define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4611686018427387904
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -458,7 +458,7 @@ define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13835058055282163712
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -473,7 +473,7 @@ define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4616189618054758400
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -488,7 +488,7 @@ define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13839561654909534208
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -505,7 +505,7 @@ define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 1082130432
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -518,7 +518,7 @@ define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, -1065353216
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -531,7 +531,7 @@ define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrsp
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 4647714815446351872
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -544,7 +544,7 @@ define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrs
; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
; SI-NOT: and
; SI: buffer_store_dwordx2
define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
%and = and i64 %a, 13871086852301127680
store i64 %and, i64 addrspace(1)* %out, align 8
ret void

@ -11,22 +11,22 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
; HSA: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.x()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; HSA: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.y()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; HSA: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.y()
store volatile i32 %val0, i32 addrspace(1)* %ptr
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
@ -34,8 +34,8 @@ define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
ret void
}
; HSA: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@ -43,15 +43,15 @@ define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
ret void
}
; HSA: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.amdgcn.workgroup.id.z()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; HSA: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.z()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@ -59,8 +59,8 @@ define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
; HSA: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.y()
%val1 = call i32 @llvm.amdgcn.workgroup.id.z()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@ -68,8 +68,8 @@ define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
; HSA: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workgroup.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
%val2 = call i32 @llvm.amdgcn.workgroup.id.z()
@ -79,29 +79,29 @@ define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
; HSA: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.x()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; HSA: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.y()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; HSA: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.amdgcn.workitem.id.z()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; HSA: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
%val1 = call i32 @llvm.amdgcn.workgroup.id.x()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@ -109,8 +109,8 @@ define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
ret void
}
; HSA: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.y()
%val1 = call i32 @llvm.amdgcn.workgroup.id.y()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@ -118,8 +118,8 @@ define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
ret void
}
; HSA: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
%val1 = call i32 @llvm.amdgcn.workitem.id.y()
%val2 = call i32 @llvm.amdgcn.workitem.id.z()
@ -129,8 +129,8 @@ define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
; HSA: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.amdgcn.workitem.id.x()
%val1 = call i32 @llvm.amdgcn.workitem.id.y()
%val2 = call i32 @llvm.amdgcn.workitem.id.z()
@ -146,8 +146,8 @@ define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
ret void
}
; HSA: define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
%dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
%bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
%val = load i32, i32 addrspace(2)* %bc
@ -155,8 +155,8 @@ define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
ret void
}
; HSA: define void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
%dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
%bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
%val = load i32, i32 addrspace(2)* %bc
@ -164,58 +164,58 @@ define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
ret void
}
; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
%stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
store volatile i32 0, i32 addrspace(4)* %stof
ret void
}
; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
%stof = addrspacecast i32* %ptr to i32 addrspace(4)*
store volatile i32 0, i32 addrspace(4)* %stof
ret void
}
; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
store volatile i32 0, i32 addrspace(3)* %ftos
ret void
}
; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
store volatile i32 0, i32* %ftos
ret void
}
; No-op addrspacecast should not use queue ptr
; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
%stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
store volatile i32 0, i32 addrspace(4)* %stof
ret void
}
; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
%stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
%ld = load volatile i32, i32 addrspace(4)* %stof
ret void
}
; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
store volatile i32 0, i32 addrspace(1)* %ftos
ret void
}
; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
%ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
%ld = load volatile i32, i32 addrspace(2)* %ftos
ret void

@ -12,22 +12,22 @@ declare i32 @llvm.r600.read.local.size.x() #0
declare i32 @llvm.r600.read.local.size.y() #0
declare i32 @llvm.r600.read.local.size.z() #0
; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tgid.x()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; ALL: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tgid.y()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; ALL: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tgid.y()
store volatile i32 %val0, i32 addrspace(1)* %ptr
%val1 = call i32 @llvm.r600.read.tgid.y()
@ -35,8 +35,8 @@ define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
ret void
}
; ALL: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tgid.x()
%val1 = call i32 @llvm.r600.read.tgid.y()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@ -44,15 +44,15 @@ define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
ret void
}
; ALL: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tgid.z()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; ALL: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tgid.x()
%val1 = call i32 @llvm.r600.read.tgid.z()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@ -60,8 +60,8 @@ define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
; ALL: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tgid.y()
%val1 = call i32 @llvm.r600.read.tgid.z()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@ -69,8 +69,8 @@ define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
; ALL: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tgid.x()
%val1 = call i32 @llvm.r600.read.tgid.y()
%val2 = call i32 @llvm.r600.read.tgid.z()
@ -80,29 +80,29 @@ define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
; ALL: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tidig.x()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; ALL: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tidig.y()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; ALL: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.tidig.z()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; ALL: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tidig.x()
%val1 = call i32 @llvm.r600.read.tgid.x()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@ -110,8 +110,8 @@ define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
ret void
}
; ALL: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tidig.y()
%val1 = call i32 @llvm.r600.read.tgid.y()
store volatile i32 %val0, i32 addrspace(1)* %ptr
@ -119,8 +119,8 @@ define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
ret void
}
; ALL: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tidig.x()
%val1 = call i32 @llvm.r600.read.tidig.y()
%val2 = call i32 @llvm.r600.read.tidig.z()
@ -130,8 +130,8 @@ define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
ret void
}
; ALL: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
; ALL: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
%val0 = call i32 @llvm.r600.read.tidig.x()
%val1 = call i32 @llvm.r600.read.tidig.y()
%val2 = call i32 @llvm.r600.read.tidig.z()
@ -147,25 +147,25 @@ define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
ret void
}
; HSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 {
; NOHSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 {
; NOHSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.local.size.x()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; HSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 {
; NOHSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 {
; NOHSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.local.size.y()
store i32 %val, i32 addrspace(1)* %ptr
ret void
}
; HSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 {
; NOHSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
; HSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 {
; NOHSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
%val = call i32 @llvm.r600.read.local.size.z()
store i32 %val, i32 addrspace(1)* %ptr
ret void

@ -6,13 +6,13 @@
; CHECK-LABEL: {{^}}test:
; CHECK: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, __unnamed_1
; CHECK: s_endpgm
define void @test() {
define amdgpu_kernel void @test() {
store i32 1, i32 addrspace(1)* @0
ret void
}
; CHECK-LABEL: {{^}}__unnamed_2:
; CHECK: s_endpgm
define void @1() {
define amdgpu_kernel void @1() {
ret void
}

@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; GCN-LABEL: {{^}}anyext_i1_i32:
; GCN: v_cndmask_b32_e64
define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
define amdgpu_kernel void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
entry:
%tmp = icmp eq i32 %cond, 0
%tmp1 = zext i1 %tmp to i8
@ -22,7 +22,7 @@ entry:
; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], -1, [[ADD]]
; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[XOR]]
; VI: buffer_store_dword [[AND]]
define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
define amdgpu_kernel void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
entry:
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%tid.y = call i32 @llvm.amdgcn.workitem.id.y()

@ -24,7 +24,7 @@ declare void @llvm.amdgcn.s.barrier() #2
; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
; SI-PROMOTE: ds_write_b32 [[PTRREG]]
define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
%alloca = alloca [16 x i32], align 16
%mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
%tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)

@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
; SI-DAG: v_mul_lo_i32
; SI-DAG: v_mul_hi_i32
; SI: s_endpgm
define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
define amdgpu_kernel void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
%mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
%tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
%a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0

@ -13,7 +13,7 @@
; CIVI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CIVI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CIVI: v_or_b32_e32
define void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
%result = ashr <2 x i16> %lhs, %rhs
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
ret void
@ -40,7 +40,7 @@ define void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16
; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@ -57,7 +57,7 @@ define void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)*
; GFX9: s_load_dword [[RHS:s[0-9]+]]
; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
define void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
define amdgpu_kernel void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@ -72,7 +72,7 @@ define void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)
; GFX9: s_load_dword [[LHS:s[0-9]+]]
; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
define void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
define amdgpu_kernel void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@ -86,7 +86,7 @@ define void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)
; GCN-LABEL: {{^}}ashr_imm_v_v2i16:
; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], -4
define void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@ -100,7 +100,7 @@ define void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(
; GCN-LABEL: {{^}}ashr_v_imm_v2i16:
; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], 8, [[LHS]]
define void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@ -117,7 +117,7 @@ define void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: {{buffer|flat}}_store_dwordx2
define void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
@ -135,7 +135,7 @@ define void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)*
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
; GCN: {{buffer|flat}}_store_dwordx2
define void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
define amdgpu_kernel void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
@ -12,7 +12,7 @@
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
; GCN: s_endpgm
define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
%result = extractvalue { i32, i1 } %pair, 0
@ -33,7 +33,7 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
; GCN: buffer_store_dwordx2 [[RESULT]],
; GCN: s_endpgm
define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
%result = extractvalue { i64, i1 } %pair, 0
@ -45,7 +45,7 @@ define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrs
; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
; GCN: s_endpgm
define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
%sub = sub i32 %a, %b
%add = add i32 %sub, 4
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@ -65,7 +65,7 @@ define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i3
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
; GCN: s_endpgm
define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
%result = extractvalue { i32, i1 } %pair, 0
@ -84,7 +84,7 @@ define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %sw
; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
; GCN: s_endpgm
define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
%gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
%pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
%result = extractvalue { i64, i1 } %pair, 0
@ -5,7 +5,7 @@
; FUNC-LABEL: {{^}}atomic_add_local:
; R600: LDS_ADD *
; SI: ds_add_u32
define void @atomic_add_local(i32 addrspace(3)* %local) {
define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) {
%unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
ret void
}
@ -13,7 +13,7 @@ define void @atomic_add_local(i32 addrspace(3)* %local) {
; FUNC-LABEL: {{^}}atomic_add_local_const_offset:
; R600: LDS_ADD *
; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
%val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
ret void
@ -22,7 +22,7 @@ define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
; FUNC-LABEL: {{^}}atomic_add_ret_local:
; R600: LDS_ADD_RET *
; SI: ds_add_rtn_u32
define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
%val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
store i32 %val, i32 addrspace(1)* %out
ret void
@ -31,7 +31,7 @@ define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %loc
; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset:
; R600: LDS_ADD_RET *
; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
define amdgpu_kernel void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
%val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
store i32 %val, i32 addrspace(1)* %out
@ -5,7 +5,7 @@
; FUNC-LABEL: {{^}}atomic_sub_local:
; R600: LDS_SUB *
; SI: ds_sub_u32
define void @atomic_sub_local(i32 addrspace(3)* %local) {
define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) {
%unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
ret void
}
@ -13,7 +13,7 @@ define void @atomic_sub_local(i32 addrspace(3)* %local) {
; FUNC-LABEL: {{^}}atomic_sub_local_const_offset:
; R600: LDS_SUB *
; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
%val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
ret void
@ -22,7 +22,7 @@ define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
; FUNC-LABEL: {{^}}atomic_sub_ret_local:
; R600: LDS_SUB_RET *
; SI: ds_sub_rtn_u32
define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
%val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
store i32 %val, i32 addrspace(1)* %out
ret void
@ -31,7 +31,7 @@ define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %loc
; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset:
; R600: LDS_SUB_RET *
; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
define amdgpu_kernel void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
%gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
%val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
store i32 %val, i32 addrspace(1)* %out
@ -5,7 +5,7 @@
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
define void @min_64_max_64() #0 {
define amdgpu_kernel void @min_64_max_64() #0 {
entry:
ret void
}
@ -16,7 +16,7 @@ attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
define void @min_64_max_128() #1 {
define amdgpu_kernel void @min_64_max_128() #1 {
entry:
ret void
}
@ -27,7 +27,7 @@ attributes #1 = {"amdgpu-flat-work-group-size"="64,128"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
define void @min_128_max_128() #2 {
define amdgpu_kernel void @min_128_max_128() #2 {
entry:
ret void
}
@ -39,7 +39,7 @@ attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
; CHECK: NumSGPRsForWavesPerEU: 13
; CHECK: NumVGPRsForWavesPerEU: 32
@var = addrspace(1) global float 0.0
define void @min_1024_max_2048() #3 {
define amdgpu_kernel void @min_1024_max_2048() #3 {
%val0 = load volatile float, float addrspace(1)* @var
%val1 = load volatile float, float addrspace(1)* @var
%val2 = load volatile float, float addrspace(1)* @var
@ -8,7 +8,7 @@
; ALL: SGPRBlocks: 1
; ALL: NumSGPRsForWavesPerEU: 9
define void @max_9_sgprs(i32 addrspace(1)* %out1,
define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
@ -49,7 +49,7 @@ define void @max_9_sgprs(i32 addrspace(1)* %out1,
; TOSMEM: SGPRBlocks: 1
; TOSMEM: NumSGPRsForWavesPerEU: 16
define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
i32 addrspace(1)* %out4,
@ -90,7 +90,7 @@ stores:
; XALL: SGPRBlocks: 2
; XALL: NumSGPRsForWavesPerEU: 18
;define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
;define amdgpu_kernel void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
; i32 addrspace(1)* %out2,
; i32 addrspace(1)* %out3,
; i32 addrspace(1)* %out4,
@ -5,7 +5,7 @@
; CHECK-LABEL: {{^}}max_20_vgprs:
; CHECK: VGPRBlocks: 4
; CHECK: NumVGPRsForWavesPerEU: 20
define void @max_20_vgprs() #1 {
define amdgpu_kernel void @max_20_vgprs() #1 {
%val0 = load volatile float, float addrspace(1)* @var
%val1 = load volatile float, float addrspace(1)* @var
%val2 = load volatile float, float addrspace(1)* @var
@ -6,7 +6,7 @@
; CHECK: VGPRBlocks: 32
; CHECK: NumSGPRsForWavesPerEU: 102
; CHECK: NumVGPRsForWavesPerEU: 129
define void @empty_exactly_1() #0 {
define amdgpu_kernel void @empty_exactly_1() #0 {
entry:
ret void
}
@ -18,7 +18,7 @@ attributes #0 = {"amdgpu-waves-per-eu"="1,1"}
; CHECK: VGPRBlocks: 10
; CHECK: NumSGPRsForWavesPerEU: 102
; CHECK: NumVGPRsForWavesPerEU: 41
define void @empty_exactly_5() #1 {
define amdgpu_kernel void @empty_exactly_5() #1 {
entry:
ret void
}
@ -30,7 +30,7 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
define void @empty_exactly_10() #2 {
define amdgpu_kernel void @empty_exactly_10() #2 {
entry:
ret void
}
@ -42,7 +42,7 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
define void @empty_at_least_1() #3 {
define amdgpu_kernel void @empty_at_least_1() #3 {
entry:
ret void
}
@ -54,7 +54,7 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
define void @empty_at_least_5() #4 {
define amdgpu_kernel void @empty_at_least_5() #4 {
entry:
ret void
}
@ -66,7 +66,7 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
define void @empty_at_least_10() #5 {
define amdgpu_kernel void @empty_at_least_10() #5 {
entry:
ret void
}
@ -80,7 +80,7 @@ attributes #5 = {"amdgpu-waves-per-eu"="10"}
; CHECK: VGPRBlocks: 10
; CHECK: NumSGPRsForWavesPerEU: 102
; CHECK: NumVGPRsForWavesPerEU: 41
define void @empty_at_most_5() #6 {
define amdgpu_kernel void @empty_at_most_5() #6 {
entry:
ret void
}
@ -92,7 +92,7 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
define void @empty_at_most_10() #7 {
define amdgpu_kernel void @empty_at_most_10() #7 {
entry:
ret void
}
@ -106,7 +106,7 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"}
; CHECK: VGPRBlocks: 0
; CHECK: NumSGPRsForWavesPerEU: 1
; CHECK: NumVGPRsForWavesPerEU: 1
define void @empty_between_5_and_10() #8 {
define amdgpu_kernel void @empty_between_5_and_10() #8 {
entry:
ret void
}
@ -120,7 +120,7 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"}
; CHECK: VGPRBlocks: 5
; CHECK: NumSGPRsForWavesPerEU: 13
; CHECK: NumVGPRsForWavesPerEU: 24
define void @exactly_10() #9 {
define amdgpu_kernel void @exactly_10() #9 {
%val0 = load volatile float, float addrspace(1)* @var
%val1 = load volatile float, float addrspace(1)* @var
%val2 = load volatile float, float addrspace(1)* @var
@ -1,56 +1,56 @@
; RUN: not llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s 2>&1 | FileCheck %s
; CHECK: can't parse integer attribute amdgpu-num-sgpr
define void @unparseable_single_0() #0 {
define amdgpu_kernel void @unparseable_single_0() #0 {
entry:
ret void
}
attributes #0 = {"amdgpu-num-sgpr"}
; CHECK: can't parse integer attribute amdgpu-num-sgpr
define void @unparseable_single_1() #1 {
define amdgpu_kernel void @unparseable_single_1() #1 {
entry:
ret void
}
attributes #1 = {"amdgpu-num-sgpr"="k"}
; CHECK: can't parse integer attribute amdgpu-num-sgpr
define void @unparseable_single_2() #2 {
define amdgpu_kernel void @unparseable_single_2() #2 {
entry:
ret void
}
attributes #2 = {"amdgpu-num-sgpr"="1,2"}
; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
define void @unparseable_pair_0() #3 {
define amdgpu_kernel void @unparseable_pair_0() #3 {
entry:
ret void
}
attributes #3 = {"amdgpu-flat-work-group-size"}
; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
define void @unparseable_pair_1() #4 {
define amdgpu_kernel void @unparseable_pair_1() #4 {
entry:
ret void
}
attributes #4 = {"amdgpu-flat-work-group-size"="k"}
; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
define void @unparseable_pair_2() #5 {
define amdgpu_kernel void @unparseable_pair_2() #5 {
entry:
ret void
}
attributes #5 = {"amdgpu-flat-work-group-size"="1"}
; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
define void @unparseable_pair_3() #6 {
define amdgpu_kernel void @unparseable_pair_3() #6 {
entry:
ret void
}
attributes #6 = {"amdgpu-flat-work-group-size"="1,k"}
; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
define void @unparseable_pair_4() #7 {
define amdgpu_kernel void @unparseable_pair_4() #7 {
entry:
ret void
}
@ -15,7 +15,7 @@
; GCN: {{^}}[[END]]:
; GCN: s_endpgm
define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
define amdgpu_kernel void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
%cmp = icmp ne i32 %val, 0
br i1 %cmp, label %store, label %end
@ -39,7 +39,7 @@ end:
; GCN: {{^}}[[END]]:
; GCN: s_endpgm
define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
%cmp0 = icmp ne i1 %val, 0
br i1 %cmp0, label %store, label %end
@ -2,7 +2,7 @@
; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
; CHECK-LABEL: {{^}}test_loop:
define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
define amdgpu_kernel void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
entry:
br label %loop.body
@ -5,7 +5,7 @@
; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]]
define void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
%in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
@ -32,7 +32,7 @@ define void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32
; GCN: [[BFE]]
; GCN: [[SHL]]
define void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
%in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
@ -52,7 +52,7 @@ define void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
define void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
%sub = sub i32 32, %width
@ -68,7 +68,7 @@ define void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
define void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
%sub = sub i32 32, %width
@ -83,7 +83,7 @@ define void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32
; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]]
define void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
%in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
@ -110,7 +110,7 @@ define void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32
; GCN: [[BFE]]
; GCN: [[SHL]]
define void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
%in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
@ -130,7 +130,7 @@ define void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
define void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
%sub = sub i32 32, %width
@ -146,7 +146,7 @@ define void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
define void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
%sub = sub i32 32, %width
@ -2,7 +2,7 @@
; CHECK: {{^}}bfe_def:
; CHECK: BFE_UINT
define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
define amdgpu_kernel void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
entry:
%0 = lshr i32 %x, 5
%1 = and i32 %0, 15 ; 0xf
@ -17,7 +17,7 @@ entry:
; CHECK: {{^}}bfe_shift:
; CHECK-NOT: BFE_UINT
define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
define amdgpu_kernel void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
entry:
%0 = lshr i32 %x, 16
%1 = and i32 %0, 65535 ; 0xffff
@ -9,7 +9,7 @@
; R600: BFI_INT
; SI: @bfi_def
; SI: v_bfi_b32
define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
define amdgpu_kernel void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
entry:
%0 = xor i32 %x, -1
%1 = and i32 %z, %0
@ -25,7 +25,7 @@ entry:
; R600: BFI_INT
; SI: @bfi_sha256_ch
; SI: v_bfi_b32
define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
define amdgpu_kernel void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
entry:
%0 = xor i32 %y, %z
%1 = and i32 %x, %0
@ -42,7 +42,7 @@ entry:
; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
define amdgpu_kernel void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
entry:
%0 = and i32 %x, %z
%1 = or i32 %x, %z
@ -4,7 +4,7 @@
; FUNC-LABEL: {{^}}bfm_pattern:
; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
define amdgpu_kernel void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
%a = shl i32 1, %x
%b = sub i32 %a, 1
%c = shl i32 %b, %y
@ -14,7 +14,7 @@ define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
; FUNC-LABEL: {{^}}bfm_pattern_simple:
; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0
define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
define amdgpu_kernel void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
%a = shl i32 1, %x
%b = sub i32 %a, 1
store i32 %b, i32 addrspace(1)* %out
@ -11,7 +11,7 @@
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
%vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>
store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
@ -27,7 +27,7 @@ define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %ou
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
%vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>
store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
@ -43,7 +43,7 @@ define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %ou
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
%vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>
store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out
@ -59,7 +59,7 @@ define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %o
; GCN: buffer_store_dwordx4
; GCN-NOT: v_mov_b32
; GCN: buffer_store_dwordx4
define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
%vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>
store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
@ -70,7 +70,7 @@ define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %o
; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source:
; GCN-NOT: store_dword
define void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
%undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1
%bc = bitcast i64 %undef to <2 x i32>
store volatile <2 x i32> %bc, <2 x i32> addrspace(1)* %out
@ -79,7 +79,7 @@ define void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)*
; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt:
; GCN-NOT: store_dword
define void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 {
%undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1
%bc = bitcast i64 %undef to <2 x i32>
%elt1 = extractelement <2 x i32> %bc, i32 1
@ -7,7 +7,7 @@
; GCN-LABEL: {{^}}materialize_0_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
; GCN: buffer_store_dword [[K]]
define void @materialize_0_i32(i32 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_0_i32(i32 addrspace(1)* %out) {
store i32 0, i32 addrspace(1)* %out
ret void
}
@ -16,7 +16,7 @@ define void @materialize_0_i32(i32 addrspace(1)* %out) {
; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
define void @materialize_0_i64(i64 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_0_i64(i64 addrspace(1)* %out) {
store i64 0, i64 addrspace(1)* %out
ret void
}
@ -24,7 +24,7 @@ define void @materialize_0_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_neg1_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -1{{$}}
; GCN: buffer_store_dword [[K]]
define void @materialize_neg1_i32(i32 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_neg1_i32(i32 addrspace(1)* %out) {
store i32 -1, i32 addrspace(1)* %out
ret void
}
@ -33,7 +33,7 @@ define void @materialize_neg1_i32(i32 addrspace(1)* %out) {
; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
define void @materialize_neg1_i64(i64 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_neg1_i64(i64 addrspace(1)* %out) {
store i64 -1, i64 addrspace(1)* %out
ret void
}
@ -41,7 +41,7 @@ define void @materialize_neg1_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_signbit_i32:
; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}}
; GCN: buffer_store_dword [[K]]
define void @materialize_signbit_i32(i32 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_signbit_i32(i32 addrspace(1)* %out) {
store i32 -2147483648, i32 addrspace(1)* %out
ret void
}
@ -50,7 +50,7 @@ define void @materialize_signbit_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 1{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
define void @materialize_signbit_i64(i64 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_signbit_i64(i64 addrspace(1)* %out) {
store i64 -9223372036854775808, i64 addrspace(1)* %out
ret void
}
@ -58,7 +58,7 @@ define void @materialize_signbit_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_rev_neg16_i32:
; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], -16{{$}}
; GCN: buffer_store_dword [[K]]
define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
store i32 268435455, i32 addrspace(1)* %out
ret void
}
@ -67,7 +67,7 @@ define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], -16{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
store i64 1152921504606846975, i64 addrspace(1)* %out
ret void
}
@ -75,7 +75,7 @@ define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_rev_neg17_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xf7ffffff{{$}}
; GCN: buffer_store_dword [[K]]
define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
store i32 -134217729, i32 addrspace(1)* %out
ret void
}
@ -84,7 +84,7 @@ define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0xf7ffffff{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
store i64 -576460752303423489, i64 addrspace(1)* %out
ret void
}
@ -92,7 +92,7 @@ define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_rev_64_i32:
; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 64{{$}}
; GCN: buffer_store_dword [[K]]
define void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
store i32 33554432, i32 addrspace(1)* %out
ret void
}
@ -101,7 +101,7 @@ define void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 64{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
define void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
store i64 144115188075855872, i64 addrspace(1)* %out
ret void
}
@ -109,7 +109,7 @@ define void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_rev_65_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x82000000{{$}}
; GCN: buffer_store_dword [[K]]
define void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
store i32 -2113929216, i32 addrspace(1)* %out
ret void
}
@ -118,7 +118,7 @@ define void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0x82000000{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
define void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
store i64 -9079256848778919936, i64 addrspace(1)* %out
ret void
}
@ -126,7 +126,7 @@ define void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_rev_3_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -2.0{{$}}
; GCN: buffer_store_dword [[K]]
define void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
store i32 -1073741824, i32 addrspace(1)* %out
ret void
}
@ -135,7 +135,7 @@ define void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], -2.0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
define void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
store i64 -4611686018427387904, i64 addrspace(1)* %out
ret void
}
@ -143,7 +143,7 @@ define void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
; GCN-LABEL: {{^}}materialize_rev_1.0_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1fc{{$}}
; GCN: buffer_store_dword [[K]]
define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
store i32 508, i32 addrspace(1)* %out
ret void
}
@ -152,70 +152,70 @@ define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0x1fc{{$}}
; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
define void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) {
define amdgpu_kernel void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) {
store i64 508, i64 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}s_materialize_0_i32:
; GCN: s_mov_b32 s{{[0-9]+}}, 0{{$}}
define void @s_materialize_0_i32() {
define amdgpu_kernel void @s_materialize_0_i32() {
call void asm sideeffect "; use $0", "s"(i32 0)
ret void
}
; GCN-LABEL: {{^}}s_materialize_1_i32:
; GCN: s_mov_b32 s{{[0-9]+}}, 1{{$}}
define void @s_materialize_1_i32() {
define amdgpu_kernel void @s_materialize_1_i32() {
call void asm sideeffect "; use $0", "s"(i32 1)
ret void
}
; GCN-LABEL: {{^}}s_materialize_neg1_i32:
; GCN: s_mov_b32 s{{[0-9]+}}, -1{{$}}
define void @s_materialize_neg1_i32() {
define amdgpu_kernel void @s_materialize_neg1_i32() {
call void asm sideeffect "; use $0", "s"(i32 -1)
ret void
}
; GCN-LABEL: {{^}}s_materialize_signbit_i32:
; GCN: s_brev_b32 s{{[0-9]+}}, 1{{$}}
define void @s_materialize_signbit_i32() {
define amdgpu_kernel void @s_materialize_signbit_i32() {
call void asm sideeffect "; use $0", "s"(i32 -2147483648)
ret void
}
; GCN-LABEL: {{^}}s_materialize_rev_64_i32:
; GCN: s_brev_b32 s{{[0-9]+}}, 64{{$}}
define void @s_materialize_rev_64_i32() {
define amdgpu_kernel void @s_materialize_rev_64_i32() {
call void asm sideeffect "; use $0", "s"(i32 33554432)
ret void
}
; GCN-LABEL: {{^}}s_materialize_rev_65_i32:
; GCN: s_mov_b32 s{{[0-9]+}}, 0x82000000{{$}}
define void @s_materialize_rev_65_i32() {
define amdgpu_kernel void @s_materialize_rev_65_i32() {
call void asm sideeffect "; use $0", "s"(i32 -2113929216)
ret void
}
; GCN-LABEL: {{^}}s_materialize_rev_neg16_i32:
; GCN: s_brev_b32 s{{[0-9]+}}, -16{{$}}
define void @s_materialize_rev_neg16_i32() {
define amdgpu_kernel void @s_materialize_rev_neg16_i32() {
call void asm sideeffect "; use $0", "s"(i32 268435455)
ret void
}
; GCN-LABEL: {{^}}s_materialize_rev_neg17_i32:
; GCN: s_mov_b32 s{{[0-9]+}}, 0xf7ffffff{{$}}
define void @s_materialize_rev_neg17_i32() {
define amdgpu_kernel void @s_materialize_rev_neg17_i32() {
call void asm sideeffect "; use $0", "s"(i32 -134217729)
ret void
}
; GCN-LABEL: {{^}}s_materialize_rev_1.0_i32:
; GCN: s_movk_i32 s{{[0-9]+}}, 0x1fc{{$}}
define void @s_materialize_rev_1.0_i32() {
define amdgpu_kernel void @s_materialize_rev_1.0_i32() {
call void asm sideeffect "; use $0", "s"(i32 508)
ret void
}
@ -14,7 +14,7 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1
; FUNC-LABEL: {{^}}s_brev_i16:
; SI: s_brev_b32
define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
%brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
store i16 %brev, i16 addrspace(1)* %out
ret void
@ -22,7 +22,7 @@ define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
; FUNC-LABEL: {{^}}v_brev_i16:
; SI: v_bfrev_b32_e32
define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
%val = load i16, i16 addrspace(1)* %valptr
%brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
store i16 %brev, i16 addrspace(1)* %out
@ -35,7 +35,7 @@ define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalia
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; SI: buffer_store_dword [[VRESULT]],
; SI: s_endpgm
define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
%brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
store i32 %brev, i32 addrspace(1)* %out
ret void
@ -46,7 +46,7 @@ define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
%val = load i32, i32 addrspace(1)* %valptr
%brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
store i32 %brev, i32 addrspace(1)* %out
@ -56,7 +56,7 @@ define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalia
; FUNC-LABEL: {{^}}s_brev_v2i32:
; SI: s_brev_b32
; SI: s_brev_b32
define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
%brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
ret void
@ -65,7 +65,7 @@ define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val)
; FUNC-LABEL: {{^}}v_brev_v2i32:
; SI: v_bfrev_b32_e32
; SI: v_bfrev_b32_e32
define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr
%brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
@ -73,7 +73,7 @@ define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrsp
}
; FUNC-LABEL: {{^}}s_brev_i64:
define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
%brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
store i64 %brev, i64 addrspace(1)* %out
ret void
@ -81,7 +81,7 @@ define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
; FUNC-LABEL: {{^}}v_brev_i64:
; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
%val = load i64, i64 addrspace(1)* %valptr
%brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
store i64 %brev, i64 addrspace(1)* %out
@ -89,14 +89,14 @@ define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalia
}
; FUNC-LABEL: {{^}}s_brev_v2i64:
define void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
%brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}v_brev_v2i64:
define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
%val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr
%brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
@ -20,7 +20,7 @@
; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short v[[B_F16]]
; GCN: s_endpgm
define void @br_cc_f16(
define amdgpu_kernel void @br_cc_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
@ -59,7 +59,7 @@ two:
; GCN: two{{$}}
; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
define void @br_cc_f16_imm_a(
define amdgpu_kernel void @br_cc_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b) {
entry:
@ -92,7 +92,7 @@ two:
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
; GCN: buffer_store_short v[[B_F16]]
; GCN: s_endpgm
define void @br_cc_f16_imm_b(
define amdgpu_kernel void @br_cc_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
@ -5,7 +5,7 @@
; FAIL: LLVM ERROR: Error while trying to spill VCC from class SReg_64: Cannot scavenge register without an emergency spill slot!
define void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
define amdgpu_kernel void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
entry:
%sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={SGPR0}"() #0
%sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={SGPR1}"() #0
@ -26,7 +26,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
bb:
%cmp = icmp eq i32 %cnd, 0
br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
@ -68,7 +68,7 @@ bb3:
; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
bb0:
%cmp = icmp eq i32 %cnd, 0
br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch
@ -108,7 +108,7 @@ bb3:
; GCN: [[ENDBB]]:
; GCN: buffer_store_dword [[V_CND]]
; GCN: s_endpgm
define void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
bb0:
%cmp = fcmp oeq float %cnd, 0.0
br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
@ -141,7 +141,7 @@ bb3:
; GCN: s_or_b64 exec, exec, [[SAVE]]
; GCN: buffer_store_dword
; GCN: s_endpgm
define void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%tid.ext = zext i32 %tid to i64
@ -188,7 +188,7 @@ bb3:
; GCN-NEXT: [[ENDBB]]:
; GCN-NEXT: s_endpgm
define void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
bb:
br label %bb2
@ -243,7 +243,7 @@ bb3:
; GCN: buffer_store_dword [[BB4_K]]
; GCN-NEXT: s_endpgm
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
define void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
bb0:
%tmp = icmp ne i32 %arg1, 0
br i1 %tmp, label %bb2, label %bb3
@ -285,7 +285,7 @@ bb4:
; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}}
; GCN-NEXT: s_setpc_b64 vcc
; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
define void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
entry:
br label %loop
@ -342,7 +342,7 @@ loop:
; GCN-NEXT: v_nop_e64
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm
define void @expand_requires_expand(i32 %cond0) #0 {
define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
bb0:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%cmp0 = icmp slt i32 %cond0, 0
@ -399,7 +399,7 @@ bb3:
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
; GCN-NEXT: s_sleep 5
; GCN-NEXT: s_endpgm
define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%d_cmp = icmp ult i32 %tid, 16
@ -462,7 +462,7 @@ endif:
; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
; GCN: buffer_store_dword
; GCN-NEXT: s_endpgm
define void @analyze_mask_branch() #0 {
define amdgpu_kernel void @analyze_mask_branch() #0 {
entry:
%reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"()
%cmp0 = fcmp ogt float %reg, 0.000000e+00
@ -17,7 +17,7 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]]
; SI: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
store i32 %bswap, i32 addrspace(1)* %out, align 4
@ -32,7 +32,7 @@ define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
; SI-DAG: v_alignbit_b32
; SI-DAG: v_bfi_b32
; SI: s_endpgm
define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
%bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8
@ -53,7 +53,7 @@ define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(
; SI-DAG: v_alignbit_b32
; SI-DAG: v_bfi_b32
; SI: s_endpgm
define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
%val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
%bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16
@ -86,7 +86,7 @@ define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(
; SI-DAG: v_alignbit_b32
; SI-DAG: v_bfi_b32
; SI: s_endpgm
define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
%val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
%bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32
@ -95,21 +95,21 @@ define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(
; FUNC-LABEL: {{^}}test_bswap_i64:
; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
%val = load i64, i64 addrspace(1)* %in, align 8
%bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
store i64 %bswap, i64 addrspace(1)* %out, align 8
ret void
}
define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
%bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16
ret void
}
define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
%val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
%bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32
@ -10,7 +10,7 @@
; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}}
define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
define amdgpu_kernel void @build_vector2 (<2 x i32> addrspace(1)* %out) {
entry:
store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
ret void
@ -28,7 +28,7 @@ entry:
; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7
; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8
; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}}
define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
define amdgpu_kernel void @build_vector4 (<4 x i32> addrspace(1)* %out) {
entry:
store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
ret void
@ -10,7 +10,7 @@
declare i32 @external_function(i32) nounwind
define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
define amdgpu_kernel void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%b = load i32, i32 addrspace(1)* %b_ptr
@ -25,7 +25,7 @@ define i32 @defined_function(i32 %x) nounwind noinline {
ret i32 %y
}
define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
define amdgpu_kernel void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%b = load i32, i32 addrspace(1)* %b_ptr
@ -35,7 +35,7 @@ define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
ret void
}
define void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
define amdgpu_kernel void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
%b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%a = load i32, i32 addrspace(1)* %in
%b = load i32, i32 addrspace(1)* %b_ptr
@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}store_fi_lifetime:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[FI]]
define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
define amdgpu_kernel void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
entry:
%b = alloca i8
call void @llvm.lifetime.start(i64 1, i8* %b)
@ -18,7 +18,7 @@ entry:
; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}}
; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
; GCN: ds_write_b32 [[VLDSPTR]], [[ZERO0]]
define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
define amdgpu_kernel void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
%tmp = alloca float
store float 4.0, float *%tmp
store float* %tmp, float* addrspace(3)* %ptr
@ -38,7 +38,7 @@ define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
; GCN: ds_write_b32 [[VLDSPTR]], [[FI1]]
define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
%tmp0 = alloca float
%tmp1 = alloca float
store float 4.0, float* %tmp0
@ -54,7 +54,7 @@ define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
define void @stored_fi_to_self() #0 {
define amdgpu_kernel void @stored_fi_to_self() #0 {
%tmp = alloca i32*
; Avoid optimizing everything out
@ -73,7 +73,7 @@ define void @stored_fi_to_self() #0 {
; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}}
; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}}
define void @stored_fi_to_self_offset() #0 {
define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
%tmp0 = alloca [512 x i32]
%tmp1 = alloca i32*
@ -98,7 +98,7 @@ define void @stored_fi_to_self_offset() #0 {
; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
define void @stored_fi_to_fi() #0 {
define amdgpu_kernel void @stored_fi_to_fi() #0 {
%tmp0 = alloca i32*
%tmp1 = alloca i32*
%tmp2 = alloca i32*
@ -118,7 +118,7 @@ define void @stored_fi_to_fi() #0 {
; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[FI]]
define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
define amdgpu_kernel void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
%tmp = alloca float
store float 0.0, float *%tmp
store float* %tmp, float* addrspace(1)* %ptr
@ -136,7 +136,7 @@ define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
%tmp0 = alloca float
%tmp1 = alloca float
%tmp2 = alloca float
@ -163,7 +163,7 @@ define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
%tmp0 = alloca [4096 x i32]
%tmp1 = alloca [4096 x i32]
%gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0
@ -186,7 +186,7 @@ define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+4
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
; GCN: buffer_store_dword [[FI]]
define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
define amdgpu_kernel void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
entry:
%b = alloca i32, align 4
%tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4
@ -7,7 +7,7 @@
; GCN: ds_write_b32
; GCN: s_branch [[LABEL]]
; GCN: s_endpgm
define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
%cmp = icmp eq i32 %n, -1
br i1 %cmp, label %for.exit, label %for.body
@ -31,7 +31,7 @@ for.body:
; GCN: ds_read_b32
; GCN: ds_write_b32
; GCN: s_branch [[LABEL]]
define void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
br label %for.body
@ -52,7 +52,7 @@ for.body:
; GCN-LABEL: {{^}}loop_const_false:
; GCN-NOT: s_branch
; GCN: s_endpgm
define void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
br label %for.body
@ -74,7 +74,7 @@ for.body:
; GCN-LABEL: {{^}}loop_const_undef:
; GCN-NOT: s_branch
; GCN: s_endpgm
define void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
entry:
br label %for.body
@ -104,7 +104,7 @@ for.body:
; GCN: s_cbranch_vccnz [[LOOPBB]]
; GCN-NEXT: ; BB#2
; GCN-NEXT: s_endpgm
define void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
entry:
br label %for.body


@ -35,7 +35,7 @@
; BUG32-NOT: Applying bug work-around
; NOBUG-NOT: Applying bug work-around
; FUNC-LABEL: {{^}}nested3:
define void @nested3(i32 addrspace(1)* %out, i32 %cond) {
define amdgpu_kernel void @nested3(i32 addrspace(1)* %out, i32 %cond) {
entry:
%0 = icmp sgt i32 %cond, 0
br i1 %0, label %if.1, label %end
@ -68,7 +68,7 @@ end:
; BUG32-NOT: Applying bug work-around
; NOBUG-NOT: Applying bug work-around
; FUNC-LABEL: {{^}}nested4:
define void @nested4(i32 addrspace(1)* %out, i32 %cond) {
define amdgpu_kernel void @nested4(i32 addrspace(1)* %out, i32 %cond) {
entry:
%0 = icmp sgt i32 %cond, 0
br i1 %0, label %if.1, label %end
@ -109,7 +109,7 @@ end:
; BUG32-NOT: Applying bug work-around
; NOBUG-NOT: Applying bug work-around
; FUNC-LABEL: {{^}}nested7:
define void @nested7(i32 addrspace(1)* %out, i32 %cond) {
define amdgpu_kernel void @nested7(i32 addrspace(1)* %out, i32 %cond) {
entry:
%0 = icmp sgt i32 %cond, 0
br i1 %0, label %if.1, label %end
@ -174,7 +174,7 @@ end:
; BUG32: Applying bug work-around
; NOBUG-NOT: Applying bug work-around
; FUNC-LABEL: {{^}}nested8:
define void @nested8(i32 addrspace(1)* %out, i32 %cond) {
define amdgpu_kernel void @nested8(i32 addrspace(1)* %out, i32 %cond) {
entry:
%0 = icmp sgt i32 %cond, 0
br i1 %0, label %if.1, label %end


@ -4,6 +4,6 @@
; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80]
; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88]
define void @eop() {
define amdgpu_kernel void @eop() {
ret void
}


@ -11,7 +11,7 @@
; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
; GCN: flat_load_dword
; GCN: {{^}}BB0_2:
define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
define amdgpu_kernel void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
entry:
%out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
@ -43,7 +43,7 @@ done:
; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32:
; CI: buffer_load_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
entry:
%out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
@ -76,7 +76,7 @@ done:
; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32:
; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
entry:
%out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7


@ -15,7 +15,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
; GCN: {{^}}BB0_2:
define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
@ -45,7 +45,7 @@ done:
; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GCN: {{^}}BB1_2:
; GCN: s_or_b64 exec
define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
@ -72,7 +72,7 @@ done:
; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
; GCN: {{^}}BB2_2:
; GCN: s_or_b64 exec
define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
@ -99,7 +99,7 @@ done:
; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
; GCN: {{^}}BB3_2:
; GCN: s_or_b64 exec
define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
%in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
@ -131,7 +131,7 @@ done:
; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
; GCN: {{^}}BB4_2:
define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
%alloca = alloca [512 x i32], align 4
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
@ -172,7 +172,7 @@ done:
; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
; GCN: {{^BB[0-9]+}}_2:
define void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
%alloca = alloca [512 x i32], align 4
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
@ -209,7 +209,7 @@ done:
; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: {{^BB[0-9]+}}_2:
define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
entry:
%alloca = alloca [512 x i32], align 4
%out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
@ -241,7 +241,7 @@ done:
; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GCN: {{^BB[0-9]+}}_2:
define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
entry:
%offset.ext = zext i32 %offset to i64
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
@ -271,7 +271,7 @@ done:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
; GCN: s_or_b64 exec, exec
define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7
@ -300,7 +300,7 @@ done:
; GCN: s_and_saveexec_b64
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
; GCN: s_or_b64 exec, exec
define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
@ -333,7 +333,7 @@ done:
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
; GCN: s_or_b64 exec, exec
define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256
@ -365,7 +365,7 @@ done:
; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295
@ -396,7 +396,7 @@ done:
; GCN: s_addc_u32
; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
; GCN: s_or_b64 exec, exec
define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181
@ -426,7 +426,7 @@ done:
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
; GCN: s_or_b64 exec, exec
define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143
@ -464,7 +464,7 @@ done:
; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
; GCN: s_or_b64 exec, exec
define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
%in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144
@ -494,7 +494,7 @@ done:
; GCN: s_load_dword [[SREG1:s[0-9]+]],
; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
; GCN-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VREG1]] offset0:3 offset1:5
define void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
entry:
%x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
%y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
@ -521,7 +521,7 @@ bb34:
; OPT: if:
; OPT: %sunkaddr = ptrtoint i8 addrspace(2)* %in to i64
; OPT: %sunkaddr1 = add i64 %sunkaddr, 4095
define void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
%in.gep = getelementptr i8, i8 addrspace(2)* %in, i64 4095
@ -548,7 +548,7 @@ done:
; OPT: %sunkaddr1 = add i32 %sunkaddr, 28
; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)*
; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %sunkaddr2, i32 2 seq_cst
define void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
%in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
@ -574,7 +574,7 @@ done:
; OPT: %sunkaddr1 = add i32 %sunkaddr, 28
; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)*
; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %sunkaddr2, i32 undef, i32 2 seq_cst monotonic
define void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
%in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
@ -600,7 +600,7 @@ done:
; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
; OPT: br i1
; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
define void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
%out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999
%in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
@ -627,7 +627,7 @@ done:
; OPT: %sunkaddr1 = add i32 %sunkaddr, 28
; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)*
; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %sunkaddr2, i32 2)
define void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
%in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
@ -653,7 +653,7 @@ done:
; OPT: %sunkaddr1 = add i32 %sunkaddr, 28
; OPT: %sunkaddr2 = inttoptr i32 %sunkaddr1 to i32 addrspace(3)*
; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %sunkaddr2, i32 2)
define void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
entry:
%out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
%in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7


@ -36,7 +36,7 @@
; GCN: BB0_3:
; GCN: buffer_store_dword
; GCN: s_endpgm
define void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
define amdgpu_kernel void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
entry:
%shr = lshr i32 %arg1, 8
br i1 undef, label %bb0, label %bb1
@ -76,7 +76,7 @@ ret:
; OPT: ret
; GCN-LABEL: {{^}}sink_sbfe_i32:
define void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
define amdgpu_kernel void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
entry:
%shr = ashr i32 %arg1, 8
br i1 undef, label %bb0, label %bb1
@ -134,7 +134,7 @@ ret:
; GCN: BB2_3:
; GCN: buffer_store_short
; GCN: s_endpgm
define void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 {
define amdgpu_kernel void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 {
entry:
%shr = lshr i16 %arg1, 4
br i1 undef, label %bb0, label %bb1
@ -187,7 +187,7 @@ ret:
; GCN: BB3_3:
; GCN: buffer_store_dwordx2
define void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 {
define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 {
entry:
%shr = lshr i64 %arg1, 30
br i1 undef, label %bb0, label %bb1
@ -236,7 +236,7 @@ ret:
; GCN: BB4_3:
; GCN: buffer_store_dwordx2
define void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 {
define amdgpu_kernel void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 {
entry:
%shr = lshr i64 %arg1, 15
br i1 undef, label %bb0, label %bb1
@ -283,7 +283,7 @@ ret:
; GCN: BB5_3:
; GCN: buffer_store_dwordx2
define void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 {
define amdgpu_kernel void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 {
entry:
%shr = lshr i64 %arg1, 35
br i1 undef, label %bb0, label %bb1


@ -8,7 +8,7 @@ declare i1 @llvm.amdgcn.class.f32(float, i32)
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
define amdgpu_kernel void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
bb0:
%tmp = icmp sgt i32 %arg1, 4
%c = icmp eq i32 %arg3, 0
@ -35,7 +35,7 @@ bb2:
; GCN-NOT: vcc
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
define void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
bb0:
%tmp = icmp sgt i32 %arg1, 4
%undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef)


@ -13,7 +13,7 @@ declare float @llvm.fma.f32(float, float, float)
; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0
; It's probably OK if this is slightly higher:
; CHECK: ; NumVgprs: 8
define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
entry:
%cmpflag = icmp eq i32 %flag, 1
br i1 %cmpflag, label %loop, label %exit


@ -8,7 +8,7 @@
; SI-LLC-LABEL: {{^}}test:
; SI-LLC: s_mul_i32
; SI-LLC-NOT: mul
define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
define amdgpu_kernel void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
entry:
%0 = mul nsw i32 %a, 3
%1 = sext i32 %0 to i64


@ -12,7 +12,7 @@
; EG-LABEL: {{^}}combine_vloads:
; EG: VTX_READ_128
; EG: VTX_READ_128
define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
define amdgpu_kernel void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
entry:
br label %for.body


@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN-LABEL: {{^}}commute_eq_64_i32:
; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -21,7 +21,7 @@ define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ne_64_i32:
; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -36,7 +36,7 @@ define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ne_litk_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}}
define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -49,7 +49,7 @@ define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_ugt_64_i32:
; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -62,7 +62,7 @@ define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_uge_64_i32:
; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -75,7 +75,7 @@ define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ult_64_i32:
; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -88,7 +88,7 @@ define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ule_63_i32:
; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -104,7 +104,7 @@ define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ule_64_i32:
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -117,7 +117,7 @@ define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}}
define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -130,7 +130,7 @@ define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_sge_neg2_i32:
; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -143,7 +143,7 @@ define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_slt_neg16_i32:
; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -156,7 +156,7 @@ define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
; GCN-LABEL: {{^}}commute_sle_5_i32:
; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -173,7 +173,7 @@ define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_eq_64_i64:
; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -186,7 +186,7 @@ define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ne_64_i64:
; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -199,7 +199,7 @@ define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_ugt_64_i64:
; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -212,7 +212,7 @@ define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_uge_64_i64:
; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -225,7 +225,7 @@ define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ult_64_i64:
; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -238,7 +238,7 @@ define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ule_63_i64:
; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -254,7 +254,7 @@ define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_ule_64_i64:
; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}}
; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -267,7 +267,7 @@ define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -280,7 +280,7 @@ define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_sge_neg2_i64:
; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -293,7 +293,7 @@ define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
; GCN-LABEL: {{^}}commute_slt_neg16_i64:
; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -306,7 +306,7 @@ define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in
; GCN-LABEL: {{^}}commute_sle_5_i64:
; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -324,7 +324,7 @@ define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -338,7 +338,7 @@ define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -351,7 +351,7 @@ define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_oge_2.0_f32:
; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -364,7 +364,7 @@ define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_olt_2.0_f32:
; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -377,7 +377,7 @@ define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ole_2.0_f32:
; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -390,7 +390,7 @@ define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_one_2.0_f32:
; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -403,7 +403,7 @@ define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ord_2.0_f32:
; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -416,7 +416,7 @@ define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -429,7 +429,7 @@ define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -442,7 +442,7 @@ define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_uge_2.0_f32:
; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -455,7 +455,7 @@ define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ult_2.0_f32:
; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -468,7 +468,7 @@ define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_ule_2.0_f32:
; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -481,7 +481,7 @@ define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_une_2.0_f32:
; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -494,7 +494,7 @@ define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_uno_2.0_f32:
; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -512,7 +512,7 @@ define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -526,7 +526,7 @@ define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -539,7 +539,7 @@ define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_oge_2.0_f64:
; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -552,7 +552,7 @@ define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_olt_2.0_f64:
; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -565,7 +565,7 @@ define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ole_2.0_f64:
; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -578,7 +578,7 @@ define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_one_2.0_f64:
; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -591,7 +591,7 @@ define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ord_2.0_f64:
; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -604,7 +604,7 @@ define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -617,7 +617,7 @@ define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -630,7 +630,7 @@ define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_uge_2.0_f64:
; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -643,7 +643,7 @@ define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ult_2.0_f64:
; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -656,7 +656,7 @@ define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_ule_2.0_f64:
; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -669,7 +669,7 @@ define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_une_2.0_f64:
; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -682,7 +682,7 @@ define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN-LABEL: {{^}}commute_uno_2.0_f64:
; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
define amdgpu_kernel void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -703,7 +703,7 @@ define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
entry:
%stack0 = alloca i32
%ptr0 = load volatile i32*, i32* addrspace(1)* undef


@ -8,7 +8,7 @@ declare float @llvm.fma.f32(float, float, float) nounwind readnone
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, 2.0
; SI: buffer_store_dword [[REG]]
define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
define amdgpu_kernel void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
@ -22,7 +22,7 @@ define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -4.0
; SI: buffer_store_dword [[REG]]
define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
define amdgpu_kernel void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
@ -37,7 +37,7 @@ define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrs
; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
; SI: buffer_store_dword [[REG]]
define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
define amdgpu_kernel void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
@ -53,7 +53,7 @@ define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(
; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[K]], |[[X]]|
; SI: buffer_store_dword [[REG]]
define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%x = load float, float addrspace(1)* %gep.0
@ -68,7 +68,7 @@ define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
; SI: buffer_store_dword [[REG]]
define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
define amdgpu_kernel void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@ -85,7 +85,7 @@ define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)*
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
; SI: buffer_store_dword [[REG]]
define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
define amdgpu_kernel void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@ -102,7 +102,7 @@ define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
; SI: buffer_store_dword [[REG]]
define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
define amdgpu_kernel void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@ -121,7 +121,7 @@ define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
; SI: buffer_store_dword [[REG]]
define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
define amdgpu_kernel void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@ -139,7 +139,7 @@ define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrs
; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
; SI: buffer_store_dword [[REG]]
define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
define amdgpu_kernel void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@ -161,7 +161,7 @@ define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float
; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]|
; SI: buffer_store_dword [[RESULT]]
define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
define amdgpu_kernel void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
%gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1


@ -8,7 +8,7 @@
; value if we want to ensure scratch memory is not being used.
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
define amdgpu_kernel void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
%concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8
ret void
@ -17,7 +17,7 @@ define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x
; FUNC-LABEL: {{^}}test_concat_v2i32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
define amdgpu_kernel void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
%concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16
ret void
@ -26,7 +26,7 @@ define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x
; FUNC-LABEL: {{^}}test_concat_v4i32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
define amdgpu_kernel void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
%concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32
ret void
@ -35,7 +35,7 @@ define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x
; FUNC-LABEL: {{^}}test_concat_v8i32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
define amdgpu_kernel void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
%concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64
ret void
@ -44,7 +44,7 @@ define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x
; FUNC-LABEL: {{^}}test_concat_v16i32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
define amdgpu_kernel void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
%concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128
ret void
@ -53,7 +53,7 @@ define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <1
; FUNC-LABEL: {{^}}test_concat_v1f32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
define amdgpu_kernel void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
%concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> <i32 0, i32 1>
store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8
ret void
@ -62,7 +62,7 @@ define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <
; FUNC-LABEL: {{^}}test_concat_v2f32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
define amdgpu_kernel void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
%concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16
ret void
@ -71,7 +71,7 @@ define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <
; FUNC-LABEL: {{^}}test_concat_v4f32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
define amdgpu_kernel void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
%concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32
ret void
@ -80,7 +80,7 @@ define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <
; FUNC-LABEL: {{^}}test_concat_v8f32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
define amdgpu_kernel void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
%concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64
ret void
@ -89,7 +89,7 @@ define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a,
; FUNC-LABEL: {{^}}test_concat_v16f32:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
define amdgpu_kernel void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
%concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128
ret void
@ -98,7 +98,7 @@ define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a
; FUNC-LABEL: {{^}}test_concat_v1i64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
define amdgpu_kernel void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
%concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
ret void
@ -107,7 +107,7 @@ define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a,
; FUNC-LABEL: {{^}}test_concat_v2i64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
define amdgpu_kernel void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
%concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
ret void
@ -116,7 +116,7 @@ define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a,
; FUNC-LABEL: {{^}}test_concat_v4i64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
define amdgpu_kernel void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
%concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
ret void
@ -125,7 +125,7 @@ define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a,
; FUNC-LABEL: {{^}}test_concat_v8i64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
define amdgpu_kernel void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
%concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
ret void
@ -134,7 +134,7 @@ define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a
; FUNC-LABEL: {{^}}test_concat_v16i64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
define amdgpu_kernel void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
%concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
ret void
@ -143,7 +143,7 @@ define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double>
; FUNC-LABEL: {{^}}test_concat_v1f64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
define amdgpu_kernel void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
%concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
ret void
@ -152,7 +152,7 @@ define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a,
; FUNC-LABEL: {{^}}test_concat_v2f64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
define amdgpu_kernel void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
%concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
ret void
@ -161,7 +161,7 @@ define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a,
; FUNC-LABEL: {{^}}test_concat_v4f64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
define amdgpu_kernel void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
%concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
ret void
@ -170,7 +170,7 @@ define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a,
; FUNC-LABEL: {{^}}test_concat_v8f64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
define amdgpu_kernel void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
%concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
ret void
@ -179,7 +179,7 @@ define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a
; FUNC-LABEL: {{^}}test_concat_v16f64:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
define amdgpu_kernel void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
%concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
ret void
@ -188,7 +188,7 @@ define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double>
; FUNC-LABEL: {{^}}test_concat_v1i1:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
define amdgpu_kernel void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
%concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> <i32 0, i32 1>
store <2 x i1> %concat, <2 x i1> addrspace(1)* %out
ret void
@ -197,7 +197,7 @@ define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1>
; FUNC-LABEL: {{^}}test_concat_v2i1:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
define amdgpu_kernel void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
%concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x i1> %concat, <4 x i1> addrspace(1)* %out
ret void
@ -206,7 +206,7 @@ define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1>
; FUNC-LABEL: {{^}}test_concat_v4i1:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
define amdgpu_kernel void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
%concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i1> %concat, <8 x i1> addrspace(1)* %out
ret void
@ -215,7 +215,7 @@ define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1>
; FUNC-LABEL: {{^}}test_concat_v8i1:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
define amdgpu_kernel void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
%concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <16 x i1> %concat, <16 x i1> addrspace(1)* %out
ret void
@ -224,7 +224,7 @@ define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1
; FUNC-LABEL: {{^}}test_concat_v16i1:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
define amdgpu_kernel void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
%concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
store <32 x i1> %concat, <32 x i1> addrspace(1)* %out
ret void
@ -233,7 +233,7 @@ define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x
; FUNC-LABEL: {{^}}test_concat_v32i1:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
define amdgpu_kernel void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
%concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
store <64 x i1> %concat, <64 x i1> addrspace(1)* %out
ret void
@ -242,7 +242,7 @@ define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x
; FUNC-LABEL: {{^}}test_concat_v1i16:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
define amdgpu_kernel void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
%concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1>
store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4
ret void
@ -251,7 +251,7 @@ define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x
; FUNC-LABEL: {{^}}test_concat_v2i16:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
define amdgpu_kernel void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
%concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8
ret void
@ -260,7 +260,7 @@ define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x
; FUNC-LABEL: {{^}}test_concat_v4i16:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
define amdgpu_kernel void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
%concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16
ret void
@ -269,7 +269,7 @@ define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x
; FUNC-LABEL: {{^}}test_concat_v8i16:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
define amdgpu_kernel void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
%concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32
ret void
@ -278,7 +278,7 @@ define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x
; FUNC-LABEL: {{^}}test_concat_v16i16:
; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
; SI-NOT: movrel
define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
define amdgpu_kernel void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
%concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64
ret void
@ -286,7 +286,7 @@ define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <1
; FUNC-LABEL: {{^}}concat_vector_crash:
; SI: s_endpgm
define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
define amdgpu_kernel void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
bb:
%tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4
%tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
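
A hedged aside on the shufflevector idiom every test in this file leans on: mask indices below the first operand's width select from %a, indices at or above it select from %b (so <i32 0, i32 1, i32 2, i32 3> over two <2 x i32> inputs is a plain concat), and an undef mask lane, as in the crash test above, leaves that result lane unconstrained. A minimal sketch, with a hypothetical function name:

define amdgpu_kernel void @mask_sketch(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) {
  ; <i32 1, i32 2> picks lane 1 of %a and lane 0 of %b
  %mix = shufflevector <2 x i32> %a, <2 x i32> %b, <2 x i32> <i32 1, i32 2>
  store <2 x i32> %mix, <2 x i32> addrspace(1)* %out, align 8
  ret void
}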


@ -1,12 +1,12 @@
# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s
--- |
define void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
define amdgpu_kernel void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%and = and i32 %a, 1234567
store volatile i32 %and, i32 addrspace(1)* %out
ret void
}
define void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
define amdgpu_kernel void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@ -17,13 +17,13 @@
ret void
}
define void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
define amdgpu_kernel void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%shl = shl i32 %a, 12
store volatile i32 %shl, i32 addrspace(1)* %out
ret void
}
define void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
define amdgpu_kernel void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@ -34,13 +34,13 @@
ret void
}
define void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
define amdgpu_kernel void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%ashr = ashr i32 %a, 12
store volatile i32 %ashr, i32 addrspace(1)* %out
ret void
}
define void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
define amdgpu_kernel void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@ -51,13 +51,13 @@
ret void
}
define void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
define amdgpu_kernel void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
%lshr = lshr i32 %a, 12
store volatile i32 %lshr, i32 addrspace(1)* %out
ret void
}
define void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
define amdgpu_kernel void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%idxprom = sext i32 %tid to i64
%gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
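
The file above is a MIR rather than an IR test: everything after the "--- |" marker is an embedded LLVM IR module, which is why plain define lines appear inside a .mir test at all, and separate "---"/"name:" documents below this excerpt carry the machine bodies that -run-pass si-fold-operands,dead-mi-elimination rewrites. A minimal skeleton of that layout, as a sketch only (the function is hypothetical and the MIR syntax is assumed to match this tree):

# RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck %s
# CHECK: S_ENDPGM
--- |
  define amdgpu_kernel void @sketch() {
    ret void
  }
...
---
name:            sketch
body:             |
  bb.0:
    S_ENDPGM
...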


@ -5,7 +5,7 @@
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
define amdgpu_kernel void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
%x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%size = call i32 @llvm.amdgcn.groupstaticsize()
%and = and i32 %size, %x
@ -17,7 +17,7 @@ define void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%and = and i32 %size, %x
store i32 %and, i32 addrspace(1)* %out
@ -28,7 +28,7 @@ define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
%x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%size = call i32 @llvm.amdgcn.groupstaticsize()
%or = or i32 %size, %x
@ -42,7 +42,7 @@ define void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; GCN-NOT: [[VVAL]]
; GCN: buffer_store_dword [[VVAL]]
define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%or = or i32 %size, %x
store i32 %or, i32 addrspace(1)* %out
@ -53,7 +53,7 @@ define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
%x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%size = call i32 @llvm.amdgcn.groupstaticsize()
%xor = xor i32 %size, %x
@ -67,7 +67,7 @@ define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; GCN-NOT: [[VVAL]]
; GCN: buffer_store_dword [[VVAL]]
define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
define amdgpu_kernel void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%xor = xor i32 %size, %x
store i32 %xor, i32 addrspace(1)* %out
@ -78,7 +78,7 @@ define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}}
; GCN-NOT: [[RESULT]]
; GCN: buffer_store_dword [[RESULT]]
define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
%size = call i32 @llvm.amdgcn.groupstaticsize()
%xor = xor i32 %size, -1
store i32 %xor, i32 addrspace(1)* %out
@ -91,7 +91,7 @@ define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]]
; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}}
; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
define void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
%vreg = load volatile i64, i64 addrspace(1)* undef
%ctpop = call i64 @llvm.ctpop.i64(i64 %vreg)
%xor = xor i64 %ctpop, -1
@ -110,7 +110,7 @@ define void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
define void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
%vreg0 = load volatile i64, i64 addrspace(1)* undef
%vreg1 = load volatile i64, i64 addrspace(1)* undef
%ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)
@ -126,7 +126,7 @@ define void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
; GCN: v_not_b32
; GCN: v_and_b32
; GCN-NOT: v_and_b32
define void @fold_mi_and_neg1(i64 addrspace(1)* %out) {
define amdgpu_kernel void @fold_mi_and_neg1(i64 addrspace(1)* %out) {
%vreg0 = load volatile i64, i64 addrspace(1)* undef
%vreg1 = load volatile i64, i64 addrspace(1)* undef
%ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)


@ -72,7 +72,7 @@
; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
define void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%load0 = load volatile i32, i32 addrspace(3)* undef
@ -150,7 +150,7 @@ endif:
; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
define void @divergent_loop(i32 addrspace(1)* %out) #0 {
define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%load0 = load volatile i32, i32 addrspace(3)* undef
@ -272,7 +272,7 @@ end:
; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
define void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%load0 = load volatile i32, i32 addrspace(3)* undef


@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN: v_cmp_ne_u32_e64
; GCN: ; mask branch
; GCN: BB{{[0-9]+_[0-9]+}}:
define void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
define amdgpu_kernel void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
bb:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1) #1
@ -30,7 +30,7 @@ bb5: ; preds = %bb3, %bb
; GCN: BB{{[0-9]+_[0-9]+}}:
define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
define amdgpu_kernel void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
bb:
%tmp = call i32 @llvm.amdgcn.workitem.id.x()
%tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1)
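
The two tests above differ only in whether the inline asm is convergent: a convergent call must not be moved to a point where a different set of threads would execute it, so the first test expects the v_cmp before the mask branch, while the non-convergent copy is free to sink past the branch into the conditional block. The attribute group #1 lives outside these hunks and presumably carries convergent; a call-site sketch of the same constraint, with a hypothetical kernel name:

define amdgpu_kernel void @convergent_sketch(i64 addrspace(1)* %out) {
  ; convergent pins this asm to the thread set of the enclosing block
  %cmp = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1) convergent
  store i64 %cmp, i64 addrspace(1)* %out
  ret void
}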


@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; GCN: buffer_load_dword [[REG:v[0-9]+]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
@ -19,7 +19,7 @@ define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)*
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@ -32,7 +32,7 @@ define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@ -47,7 +47,7 @@ define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
; GCN: buffer_store_dword [[REG]]
; GCN: buffer_store_dword [[REG]]
; GCN: s_endpgm
define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@ -65,7 +65,7 @@ define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
; GCN-DAG: buffer_store_dword
; GCN: s_endpgm
define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
%add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
@ -85,7 +85,7 @@ define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> add
; GCN: {{buffer|flat}}_store_dword
; GCN: {{buffer|flat}}_store_dword
; GCN: s_endpgm
define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
@ -101,7 +101,7 @@ define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8>
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN: s_endpgm
define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
ret void
@ -113,7 +113,7 @@ define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
; GCN: s_endpgm
define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
ret void
@ -128,7 +128,7 @@ define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
%val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
ret void
@ -141,7 +141,7 @@ define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa
; GCN: buffer_load_ubyte
; GCN: buffer_store_dword
; GCN: s_endpgm
define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void
@ -157,7 +157,7 @@ define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8>
; GCN: buffer_store_byte
; GCN: buffer_store_byte
; GCN: s_endpgm
define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
%val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
ret void


@ -6,7 +6,7 @@
; Make sure this doesn't crash
; CHECK-LABEL: {{^}}copy_to_reg_frameindex:
define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
define amdgpu_kernel void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
entry:
%alloca = alloca [16 x i32]
br label %loop


@ -27,7 +27,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; EG: FFBH_UINT
; EG: CNDE_INT
define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
store i32 %ctlz, i32 addrspace(1)* %out, align 4
ret void
@ -43,7 +43,7 @@ define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; EG: FFBH_UINT
; EG: CNDE_INT
define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr, align 4
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
store i32 %ctlz, i32 addrspace(1)* %out, align 4
@ -61,7 +61,7 @@ define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalia
; EG: CNDE_INT
; EG: FFBH_UINT
; EG: CNDE_INT
define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
%ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
@ -89,7 +89,7 @@ define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrsp
; EG-DAG: FFBH_UINT
; EG-DAG: CNDE_INT
define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
%val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
@ -101,7 +101,7 @@ define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrsp
; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_byte [[RESULT]],
; GCN: s_endpgm
define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
%val = load i8, i8 addrspace(1)* %valptr
%ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
store i8 %ctlz, i8 addrspace(1)* %out
@ -119,14 +119,14 @@ define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %
; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
store i64 %ctlz, i64 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}s_ctlz_i64_trunc:
define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
%trunc = trunc i64 %ctlz to i32
store i32 %trunc, i32 addrspace(1)* %out
@ -145,7 +145,7 @@ define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind
; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@ -156,7 +156,7 @@ define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalia
}
; FUNC-LABEL: {{^}}v_ctlz_i64_trunc:
define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -172,7 +172,7 @@ define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)*
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp eq i32 %val, 0
@ -186,7 +186,7 @@ define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)*
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp ne i32 %val, 0
@ -202,7 +202,7 @@ define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspac
; GCN: v_cmp
; GCN: v_cndmask
; GCN: s_endpgm
define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp eq i32 %ctlz, 32
@ -217,7 +217,7 @@ define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addr
; GCN: v_cmp
; GCN: v_cndmask
; GCN: s_endpgm
define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
%cmp = icmp ne i32 %ctlz, 32
@ -230,7 +230,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr
; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
; GCN: {{buffer|flat}}_store_byte [[FFBH]],
define void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
%val = load i8, i8 addrspace(1)* %valptr.gep
@ -245,7 +245,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr
; SI: buffer_load_ushort [[VAL:v[0-9]+]],
; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
; SI: buffer_store_short [[FFBH]],
define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
%val = load i16, i16 addrspace(1)* %valptr
%ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
%cmp = icmp eq i16 %val, 0
@ -260,7 +260,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr
; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]]
; GCN: {{buffer|flat}}_store_byte [[TRUNC]],
define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
%val = load i7, i7 addrspace(1)* %valptr.gep
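
This file and the next exercise the two values of llvm.ctlz's second operand. With i1 false the result is fully defined (32 for a zero i32 input), which is why the checks above wrap FFBH_UINT/v_ffbh_u32 in a CNDE_INT/cndmask select; with i1 true a zero input gives an undefined result, so the bare ffbh suffices. A minimal sketch contrasting the two forms (the kernel name is hypothetical):

declare i32 @llvm.ctlz.i32(i32, i1)

define amdgpu_kernel void @ctlz_flag_sketch(i32 addrspace(1)* %out, i32 %x) {
  %defined = call i32 @llvm.ctlz.i32(i32 %x, i1 false)   ; 32 when %x is 0, needs a select
  %zero_undef = call i32 @llvm.ctlz.i32(i32 %x, i1 true) ; unspecified when %x is 0, bare ffbh
  %sum = add i32 %defined, %zero_undef
  store i32 %sum, i32 addrspace(1)* %out
  ret void
}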


@ -22,7 +22,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; GCN: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
store i32 %ctlz, i32 addrspace(1)* %out, align 4
ret void
@ -35,7 +35,7 @@ define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou
; GCN: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr, align 4
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
store i32 %ctlz, i32 addrspace(1)* %out, align 4
@ -51,7 +51,7 @@ define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
%ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
@ -71,7 +71,7 @@ define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
; EG: FFBH_UINT {{\*? *}}[[RESULT]]
define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
%val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
%ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
@ -82,7 +82,7 @@ define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x
; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_byte [[RESULT]],
define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
%val = load i8, i8 addrspace(1)* %valptr
%ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
store i8 %ctlz, i8 addrspace(1)* %out
@ -100,14 +100,14 @@ define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)
; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
define void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
store i64 %ctlz, i64 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64_trunc:
define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
%trunc = trunc i64 %ctlz to i32
store i32 %trunc, i32 addrspace(1)* %out
@ -123,7 +123,7 @@ define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %va
; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@ -134,7 +134,7 @@ define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace
}
; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64_trunc:
define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@ -149,7 +149,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
; GCN: buffer_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 0
@ -162,7 +162,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
; GCN: buffer_load_dword [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[RESULT]],
define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp ne i32 %val, 0
@ -175,7 +175,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i
; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
; GCN: {{buffer|flat}}_store_byte [[FFBH]],
define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
%tid = call i32 @llvm.r600.read.tidig.x()
%valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
%val = load i8, i8 addrspace(1)* %valptr.gep
@ -194,7 +194,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8
; GCN-DAG: buffer_store_dword [[RESULT0]]
; GCN-DAG: buffer_store_byte [[RESULT1]]
; GCN: s_endpgm
define void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 0
@ -211,7 +211,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
define void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 0
@ -227,7 +227,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp ne i32 %val, 0
@ -243,7 +243,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
define void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp eq i32 %val, 1
@ -259,7 +259,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32
; GCN: v_cmp
; GCN: v_cndmask
; GCN: buffer_store_dword
define void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr
%ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
%cmp = icmp ne i32 %val, 1


@ -16,7 +16,7 @@ declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
; GCN: s_endpgm
; EG: BCNT_INT
define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
store i32 %ctpop, i32 addrspace(1)* %out, align 4
ret void
@ -30,7 +30,7 @@ define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
; GCN: s_endpgm
; EG: BCNT_INT
define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
store i32 %ctpop, i32 addrspace(1)* %out, align 4
@ -48,7 +48,7 @@ define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noali
; EG: BCNT_INT
; EG: BCNT_INT
define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
%val0 = load i32, i32 addrspace(1)* %in0, align 4
%val1 = load i32, i32 addrspace(1)* %in1, align 4
%ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
@ -64,7 +64,7 @@ define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace
; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
%val0 = load i32, i32 addrspace(1)* %in0, align 4
%ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
%add = add i32 %ctpop0, %sval
@ -79,7 +79,7 @@ define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(
; EG: BCNT_INT
; EG: BCNT_INT
define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
%ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
@ -97,7 +97,7 @@ define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrs
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
%val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
%ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
@ -123,7 +123,7 @@ define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrs
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
%val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
%ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
@ -165,7 +165,7 @@ define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrs
; EG: BCNT_INT
; EG: BCNT_INT
; EG: BCNT_INT
define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
%val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32
%ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
@ -179,7 +179,7 @@ define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> ad
; GCN: s_endpgm
; EG: BCNT_INT
define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %ctpop, 4
@ -194,7 +194,7 @@ define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32
; GCN: s_endpgm
; EG: BCNT_INT
define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 4, %ctpop
@ -209,7 +209,7 @@ define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out,
; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %ctpop, 99999
@ -225,7 +225,7 @@ define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspa
; GCN: s_endpgm
; EG: BCNT_INT
define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %ctpop, %const
@ -241,7 +241,7 @@ define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1
; GCN: s_endpgm
; EG: BCNT_INT
define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%add = add i32 %const, %ctpop
@ -258,7 +258,7 @@ define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspa
; GCN: s_endpgm
; EG: BCNT_INT
define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
%val = load i32, i32 addrspace(1)* %in, align 4
%ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
%gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
@ -279,7 +279,7 @@ define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrsp
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
entry:
%tmp0 = icmp eq i32 %cond, 0
br i1 %tmp0, label %if, label %else

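The file above checks that a population count of a uniform value selects the scalar s_bcnt1_i32_b32, that a count of a loaded (divergent) value selects v_bcnt_u32_b32, and that a following add is folded into the bcnt's second source operand. A minimal sketch of the folded shape, written here for illustration rather than taken from the test file:

declare i32 @llvm.ctpop.i32(i32) nounwind readnone

; Sketch only: the add of an inline constant is expected to fold into the
; second source of v_bcnt_u32_b32 instead of emitting a separate v_add.
define amdgpu_kernel void @ctpop_fold_sketch(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
  %val = load i32, i32 addrspace(1)* %in, align 4
  %pop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
  %sum = add i32 %pop, 4
  store i32 %sum, i32 addrspace(1)* %out, align 4
  ret void
}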

@ -17,7 +17,7 @@ declare i128 @llvm.ctpop.i128(i128) nounwind readnone
; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; GCN: buffer_store_dword [[VRESULT]],
; GCN: s_endpgm
define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%truncctpop = trunc i64 %ctpop to i32
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@ -31,7 +31,7 @@ define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
%val = load i64, i64 addrspace(1)* %in, align 8
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%truncctpop = trunc i64 %ctpop to i32
@ -48,7 +48,7 @@ define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noali
; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}
; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
; GCN: s_endpgm
define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
%val = load i64, i64 addrspace(1)* %in, align 8
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%or = or i64 %ctpop, %s.val
@ -60,7 +60,7 @@ define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)*
; GCN: s_bcnt1_i32_b64
; GCN: s_bcnt1_i32_b64
; GCN: s_endpgm
define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
define amdgpu_kernel void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
%ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
%truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
@ -73,7 +73,7 @@ define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val)
; GCN: s_bcnt1_i32_b64
; GCN: s_bcnt1_i32_b64
; GCN: s_endpgm
define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
%ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
%truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
@ -86,7 +86,7 @@ define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val)
; GCN: v_bcnt_u32_b32
; GCN: v_bcnt_u32_b32
; GCN: s_endpgm
define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
%val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
%ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
%truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
@ -104,7 +104,7 @@ define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrs
; GCN: v_bcnt_u32_b32
; GCN: v_bcnt_u32_b32
; GCN: s_endpgm
define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
%val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
%ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
%truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
@ -121,7 +121,7 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs
; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]]
; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
; GCN: s_endpgm
define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
define amdgpu_kernel void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
entry:
%tmp0 = icmp eq i32 %cond, 0
br i1 %tmp0, label %if, label %else
@ -146,7 +146,7 @@ endif:
; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]],
; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT1]], [[SRESULT0]]
; GCN: s_endpgm
define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
define amdgpu_kernel void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
%ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
%truncctpop = trunc i128 %ctpop to i32
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@ -159,7 +159,7 @@ define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]],
; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]]
; GCN: s_endpgm
define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
%ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone
%truncctpop = trunc i65 %ctpop to i32
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@ -181,7 +181,7 @@ define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
%val = load i128, i128 addrspace(1)* %in, align 8
%ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
%truncctpop = trunc i128 %ctpop to i32

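For 64-bit counts the tests above expect a single s_bcnt1_i32_b64 when the input is uniform, and a chain of two v_bcnt_u32_b32 when it is loaded, the second bcnt accumulating onto the first's result. Because a 64-bit population count never exceeds 64, the tests truncate to i32 before storing; a minimal uniform-input sketch, not from the patch:

declare i64 @llvm.ctpop.i64(i64) nounwind readnone

define amdgpu_kernel void @ctpop_i64_sketch(i32 addrspace(1)* %out, i64 %val) nounwind {
  %pop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone ; -> s_bcnt1_i32_b64
  %trunc = trunc i64 %pop to i32                              ; count fits in 32 bits
  store i32 %trunc, i32 addrspace(1)* %out, align 4
  ret void
}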

@ -14,7 +14,7 @@ declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
%cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
store i32 %cttz, i32 addrspace(1)* %out, align 4
ret void
@ -27,7 +27,7 @@ define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou
; SI: s_endpgm
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
%val = load i32, i32 addrspace(1)* %valptr, align 4
%cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
store i32 %cttz, i32 addrspace(1)* %out, align 4
@ -43,7 +43,7 @@ define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
%val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
%cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
@ -63,7 +63,7 @@ define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
; EG: FFBL_INT {{\*? *}}[[RESULT]]
define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
%val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
%cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16

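The i1 true operand of llvm.cttz asserts that a zero input gives an undefined result, which is what lets these tests expect the bare find-first-bit instruction (FFBL_INT on EG, and the scalar equivalent on GCN) with no zero guard around it. Restating the first test with that flag spelled out in comments:

declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone

define amdgpu_kernel void @cttz_flag_sketch(i32 addrspace(1)* %out, i32 %val) nounwind {
  ; i1 true: a zero input is undefined, so no select or guard is needed
  ; around the find-first-bit instruction.
  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
  store i32 %cttz, i32 addrspace(1)* %out, align 4
  ret void
}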

@ -12,7 +12,7 @@ declare float @llvm.amdgcn.cubema(float, float, float) #0
; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN: _store_dwordx4
define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
define amdgpu_kernel void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
%cubeid = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
%cubesc = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
%cubetc = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)

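The hunk above is cut off before the cubema call and the store. As an illustration only, and not the test's actual continuation, the four cube intrinsics are naturally gathered into a <4 x float> so that a single dwordx4 store can be checked:

declare float @llvm.amdgcn.cubeid(float, float, float) nounwind readnone
declare float @llvm.amdgcn.cubesc(float, float, float) nounwind readnone
declare float @llvm.amdgcn.cubetc(float, float, float) nounwind readnone
declare float @llvm.amdgcn.cubema(float, float, float) nounwind readnone

define amdgpu_kernel void @cube_sketch(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) {
  %id = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
  %sc = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
  %tc = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
  %ma = call float @llvm.amdgcn.cubema(float %a, float %b, float %c)
  %v0 = insertelement <4 x float> undef, float %id, i32 0
  %v1 = insertelement <4 x float> %v0, float %sc, i32 1
  %v2 = insertelement <4 x float> %v1, float %tc, i32 2
  %v3 = insertelement <4 x float> %v2, float %ma, i32 3
  store <4 x float> %v3, <4 x float> addrspace(1)* %out
  ret void
}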

@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
; GCN-NOT: lshr
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
; GCN: buffer_store_dword [[CONV]],
define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
%load = load i8, i8 addrspace(1)* %in, align 1
%cvt = uitofp i8 %load to float
store float %cvt, float addrspace(1)* %out, align 4
@ -22,7 +22,7 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
%cvt = uitofp <2 x i8> %load to <2 x float>
store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
@ -36,7 +36,7 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
%cvt = uitofp <3 x i8> %load to <3 x float>
store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
@ -52,7 +52,7 @@ define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8>
; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
%cvt = uitofp <4 x i8> %load to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@ -76,7 +76,7 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8>
; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
; GCN: buffer_store_dwordx4
define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
%cvt = uitofp <4 x i8> %load to <4 x float>
store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@ -110,7 +110,7 @@ define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out
; GCN: {{buffer|flat}}_store_dword
; GCN: s_endpgm
define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
%tid.x = call i32 @llvm.amdgcn.workitem.id.x()
%in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
@ -124,7 +124,7 @@ define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <
; Make sure this doesn't crash.
; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
; GCN: s_endpgm
define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
%cvt = uitofp <7 x i8> %load to <7 x float>
store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
@ -147,7 +147,7 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8>
; GCN-NOT: lshr
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4
define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
%cvt = uitofp <8 x i8> %load to <8 x float>
store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
@ -159,7 +159,7 @@ define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8>
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
; GCN: buffer_store_dword [[CONV]],
define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%load = load i32, i32 addrspace(1)* %in, align 4
%add = add i32 %load, 2
%inreg = and i32 %add, 255
@ -169,7 +169,7 @@ define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addr
}
; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%load = load i32, i32 addrspace(1)* %in, align 4
%inreg = and i32 %load, 65280
%shr = lshr i32 %inreg, 8
@ -181,7 +181,7 @@ define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addr
; We don't get these ones because of the zext, but instcombine removes
; them so it shouldn't really matter.
; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
%load = load i8, i8 addrspace(1)* %in, align 1
%ext = zext i8 %load to i32
%cvt = uitofp i32 %ext to float
@ -190,7 +190,7 @@ define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1
}
; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
%load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
%ext = zext <4 x i8> %load to <4 x i32>
%cvt = uitofp <4 x i32> %ext to <4 x float>
@ -203,7 +203,7 @@ define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in
%and = and i32 %val, 255
%cvt = uitofp i32 %and to float
@ -216,7 +216,7 @@ define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspac
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in
%srl = lshr i32 %val, 8
%and = and i32 %srl, 255
@ -230,7 +230,7 @@ define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspac
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in
%srl = lshr i32 %val, 16
%and = and i32 %srl, 255
@ -244,7 +244,7 @@ define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspac
; GCN-NOT: [[VAL]]
; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
; GCN: buffer_store_dword [[CONV]]
define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
%val = load i32, i32 addrspace(1)* %in
%srl = lshr i32 %val, 24
%and = and i32 %srl, 255

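The pattern targeted throughout this file is uitofp of one byte selected with lshr and a 255 mask, which maps onto v_cvt_f32_ubyteN_e32 where N is the byte index; as the comment in the file notes, an explicit zext of an i8 load hides the pattern, but instcombine normally folds the zext away before it matters. The byte-1 shape, mirroring extract_byte1_to_f32 with each step annotated:

define amdgpu_kernel void @byte1_sketch(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
  %val = load i32, i32 addrspace(1)* %in
  %srl = lshr i32 %val, 8          ; shift byte 1 down to bit 0
  %and = and i32 %srl, 255         ; isolate that byte
  %cvt = uitofp i32 %and to float  ; whole chain -> v_cvt_f32_ubyte1_e32
  store float %cvt, float addrspace(1)* %out, align 4
  ret void
}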

@ -10,7 +10,7 @@ declare float @llvm.floor.f32(float) #1
; SI-NOT: add
; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; SI: s_endpgm
define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
define amdgpu_kernel void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
%floor = call float @llvm.floor.f32(float %x) #1
%cvt = fptosi float %floor to i32
store i32 %cvt, i32 addrspace(1)* %out
@ -22,7 +22,7 @@ define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_flr_i32_f32
; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]]
; SI: s_endpgm
define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
define amdgpu_kernel void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
%fadd = fadd float %x, 1.0
%floor = call float @llvm.floor.f32(float %fadd) #1
%cvt = fptosi float %floor to i32
@ -35,7 +35,7 @@ define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_flr_i32_f32
; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|
; SI: s_endpgm
define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
define amdgpu_kernel void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%floor = call float @llvm.floor.f32(float %x.fabs) #1
%cvt = fptosi float %floor to i32
@ -48,7 +48,7 @@ define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_flr_i32_f32
; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}
; SI: s_endpgm
define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
define amdgpu_kernel void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
%x.fneg = fsub float -0.000000e+00, %x
%floor = call float @llvm.floor.f32(float %x.fneg) #1
%cvt = fptosi float %floor to i32
@ -61,7 +61,7 @@ define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_flr_i32_f32
; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|
; SI: s_endpgm
define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
define amdgpu_kernel void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
%floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1
@ -75,7 +75,7 @@ define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
; SI: v_floor_f32
; SI: v_cvt_u32_f32_e32
; SI: s_endpgm
define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
define amdgpu_kernel void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
%floor = call float @llvm.floor.f32(float %x) #1
%cvt = fptoui float %floor to i32
store i32 %cvt, i32 addrspace(1)* %out

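fptosi of llvm.floor.f32 can fuse into the single v_cvt_flr_i32_f32, but only in the no-NaN run lines (SI-NONAN); the safe run keeps the separate floor and convert, and the fptoui variant (no_cvt_flr_i32_f32_0) never fuses because the instruction produces a signed result. A minimal sketch of the fusable shape:

declare float @llvm.floor.f32(float) nounwind readnone

define amdgpu_kernel void @flr_sketch(i32 addrspace(1)* %out, float %x) {
  %floor = call float @llvm.floor.f32(float %x)
  %cvt = fptosi float %floor to i32   ; -> v_cvt_flr_i32_f32 under SI-NONAN
  store i32 %cvt, i32 addrspace(1)* %out
  ret void
}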

@ -9,7 +9,7 @@ declare float @llvm.floor.f32(float) #1
; SI-SAFE-NOT: v_cvt_rpi_i32_f32
; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
; SI: s_endpgm
define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
define amdgpu_kernel void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
%fadd = fadd float %x, 0.5
%floor = call float @llvm.floor.f32(float %fadd) #1
%cvt = fptosi float %floor to i32
@ -21,7 +21,7 @@ define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_rpi_i32_f32
; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
; SI: s_endpgm
define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
define amdgpu_kernel void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%fadd = fadd float %x.fabs, 0.5
%floor = call float @llvm.floor.f32(float %fadd) #1
@ -37,7 +37,7 @@ define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_flr_i32_f32
; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
; SI: s_endpgm
define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
define amdgpu_kernel void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
%x.fneg = fsub float -0.000000e+00, %x
%fadd = fadd float %x.fneg, 0.5
%floor = call float @llvm.floor.f32(float %fadd) #1
@ -55,7 +55,7 @@ define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
; SI-SAFE-NOT: v_cvt_flr_i32_f32
; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
; SI: s_endpgm
define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
define amdgpu_kernel void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
%x.fabs = call float @llvm.fabs.f32(float %x) #1
%x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
%fadd = fadd float %x.fabs.fneg, 0.5
@ -71,7 +71,7 @@ define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
; SI: v_floor_f32
; SI: v_cvt_u32_f32
; SI: s_endpgm
define void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
define amdgpu_kernel void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
%fadd = fadd float %x, 0.5
%floor = call float @llvm.floor.f32(float %fadd) #1
%cvt = fptoui float %floor to i32

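v_cvt_rpi_i32_f32 evaluates floor(x + 0.5) and converts in one instruction, so the matchable IR is an fadd of exactly 0.5 feeding floor and then fptosi. The tests above show that fneg on the input defeats the rpi match and falls back to the flr form, and that fptoui never matches. The canonical shape, sketched:

declare float @llvm.floor.f32(float) nounwind readnone

define amdgpu_kernel void @rpi_sketch(i32 addrspace(1)* %out, float %x) {
  %fadd = fadd float %x, 0.5
  %floor = call float @llvm.floor.f32(float %fadd)
  %cvt = fptosi float %floor to i32   ; -> v_cvt_rpi_i32_f32 under SI-NONAN
  store i32 %cvt, i32 addrspace(1)* %out
  ret void
}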

@ -9,7 +9,7 @@
; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
define void @store_same_base_ptr(i32 addrspace(1)* %out) {
define amdgpu_kernel void @store_same_base_ptr(i32 addrspace(1)* %out) {
entry:
%id = call i32 @llvm.amdgcn.workitem.id.x() #0
%offset = sext i32 %id to i64


@ -10,7 +10,7 @@
; CHECK: {{^}}sint:
; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
define amdgpu_kernel void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%sint = load i32, i32 addrspace(1) * %in
@ -24,7 +24,7 @@ entry:
;CHECK: {{^}}uint:
;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
define amdgpu_kernel void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
entry:
%ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
%uint = load i32, i32 addrspace(1) * %in


@ -4,7 +4,7 @@
; Test for a crash in the custom assembly dump code.
; SI: s_endpgm
define void @test(i32 addrspace(1)* %out) {
define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
store i32 0, i32 addrspace(1)* %out
ret void
}

Some files were not shown because too many files have changed in this diff.