2016-08-04 09:04:54 +02:00
|
|
|
; RUN: llc -march=amdgcn -mcpu=tonga -regalloc=basic -post-RA-scheduler=0 < %s | FileCheck %s
|
AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit
Summary:
For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024. However, if we want to allow large workgroup sizes, we may need to use fewer registers, as we have to run more waves per SIMD.
This patch adds an attribute to specify the maximum work group size the compiled program needs to support. It defaults to 256, as that has no wave restrictions.
Reducing the number of registers available is done similarly to how the registers were reserved for chips with the sgpr init bug.
Reviewers: mareko, arsenm, tstellarAMD, nhaehnle
Subscribers: FireBurn, kerberizer, llvm-commits, arsenm
Differential Revision: http://reviews.llvm.org/D18340
Patch By: Bas Nieuwenhuizen
llvm-svn: 266337
2016-04-14 18:27:07 +02:00
|
|
|
|
2016-05-11 02:28:54 +02:00
|
|
|
; CHECK: NumVgprs: 64
|
AMDGPU: allow specifying a workgroup size that needs to fit in a compute unit
Summary:
For GL_ARB_compute_shader we need to support workgroup sizes of at least 1024. However, if we want to allow large workgroup sizes, we may need to use fewer registers, as we have to run more waves per SIMD.
This patch adds an attribute to specify the maximum work group size the compiled program needs to support. It defaults to 256, as that has no wave restrictions.
Reducing the number of registers available is done similarly to how the registers were reserved for chips with the sgpr init bug.
Reviewers: mareko, arsenm, tstellarAMD, nhaehnle
Subscribers: FireBurn, kerberizer, llvm-commits, arsenm
Differential Revision: http://reviews.llvm.org/D18340
Patch By: Bas Nieuwenhuizen
llvm-svn: 266337
2016-04-14 18:27:07 +02:00
|
|
|
; Compute-shader entry point.  Attribute #0 ("amdgpu-max-work-group-size"="1024")
; asks the backend to support work groups of up to 1024 threads, which forces it
; to cap register usage so enough waves fit per SIMD; the CHECK line above
; verifies the kernel is limited to 64 VGPRs.
; NOTE(review): the interleaved '|' filler lines were extraction artifacts, not
; valid IR tokens, and have been removed so the test parses.
define void @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, [16 x <8 x i32>] addrspace(2)* byval, [16 x <4 x i32>] addrspace(2)* byval, <3 x i32> inreg, <3 x i32> inreg, <3 x i32>) #0 {
main_body:
; %4 is a table of 16 buffer resource descriptors; fetch entry 8.
%8 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %4, i64 0, i64 8
%9 = load <4 x i32>, <4 x i32> addrspace(2)* %8, align 16, !tbaa !0
; Build a data-dependent index from components 0 and 1 of the last <3 x i32>
; argument (%7).  NOTE(review): %7 is presumably the local invocation ID under
; the amdgcn ABI — confirm against the calling convention.
%10 = extractelement <3 x i32> %7, i32 0
%11 = extractelement <3 x i32> %7, i32 1
%12 = mul i32 %10, %11
%bc = bitcast <3 x i32> %7 to <3 x float>
%13 = extractelement <3 x float> %bc, i32 1
; Dynamic insert/extract on a <512 x float> keeps a huge live vector around,
; deliberately stressing VGPR allocation for this test.
%14 = insertelement <512 x float> undef, float %13, i32 %12
call void @llvm.amdgcn.s.barrier()
%15 = extractelement <3 x i32> %6, i32 0
%16 = extractelement <3 x i32> %7, i32 0
; Compute a byte offset for the buffer store: ((%15 << 5) + %16) << 4.
%17 = shl i32 %15, 5
%18 = add i32 %17, %16
%19 = shl i32 %18, 4
; Load a dynamic extract index from LDS (addrspace(3)) at offset %20 * 4.
%20 = extractelement <3 x i32> %7, i32 1
%21 = shl i32 %20, 2
%22 = sext i32 %21 to i64
%23 = getelementptr i8, i8 addrspace(3)* null, i64 %22
%24 = bitcast i8 addrspace(3)* %23 to i32 addrspace(3)*
%25 = load i32, i32 addrspace(3)* %24, align 4
%26 = extractelement <512 x float> %14, i32 %25
%27 = insertelement <4 x float> undef, float %26, i32 0
; Store the result through the buffer descriptor loaded above.
call void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float> %27, <4 x i32> %9, i32 0, i32 %19, i1 false, i1 false)
ret void
}
|
|
|
|
|
|
|
|
; Work-group barrier intrinsic used in @main; convergent nounwind (#1).
declare void @llvm.amdgcn.s.barrier() #1
|
|
|
|
|
|
|
|
; Formatted buffer store intrinsic (data, resource descriptor, vindex, offset,
; glc, slc); nounwind (#2).
declare void @llvm.amdgcn.buffer.store.format.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
|
|
|
|
|
|
|
|
; #0: the attribute under test — request support for work groups of up to
; 1024 threads, which constrains the register budget (see CHECK above).
; NOTE(review): the interleaved '|' filler lines were extraction artifacts
; and have been removed so the file parses.
attributes #0 = { "amdgpu-max-work-group-size"="1024" }
attributes #1 = { convergent nounwind }
attributes #2 = { nounwind }
|
|
|
|
|
|
|
|
; TBAA metadata for the descriptor load in @main: "const" access tag.
; NOTE(review): the interleaved '|' filler lines were extraction artifacts
; and have been removed so the file parses.
!0 = !{!1, !1, i64 0, i32 1}
!1 = !{!"const", null}
|