
[AMDGPU] Increase alignment of LDS globals if necessary before LDS lowering.

Before packing LDS globals into a sorted structure, make sure that
their alignment is properly updated based on their size. This ensures
that the members of the sorted structure are properly aligned, which in
turn further reduces the likelihood of unaligned LDS accesses.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D103261
Author: hsmahesha
Date: 2021-06-04 09:34:37 +05:30
Parent: ad23e89deb
Commit: fdb852810c
8 changed files with 281 additions and 71 deletions


@@ -172,6 +172,29 @@ private:
return false;
}
// Increase the alignment of LDS globals if necessary to maximise the chance
// that we can use aligned LDS instructions to access them.
for (auto *GV : FoundLocalVars) {
Align Alignment(GV->getAlignment());
TypeSize GVSize = DL.getTypeAllocSize(GV->getValueType());
if (GVSize > 8) {
// We might want to use a b96 or b128 load/store
Alignment = std::max(Alignment, Align(16));
} else if (GVSize > 4) {
// We might want to use a b64 load/store
Alignment = std::max(Alignment, Align(8));
} else if (GVSize > 2) {
// We might want to use a b32 load/store
Alignment = std::max(Alignment, Align(4));
} else if (GVSize > 1) {
// We might want to use a b16 load/store
Alignment = std::max(Alignment, Align(2));
}
GV->setAlignment(Alignment);
}
// Sort by alignment, descending, to minimise padding.
// On ties, sort by size, descending, then by name, lexicographical.
llvm::stable_sort(

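To see the size-to-alignment rule added above in isolation, here is a minimal standalone sketch of the same bumping logic (illustrative only; chooseLdsAlignment is a made-up helper name, not part of the pass, which works on llvm::Align and GlobalVariable):

#include <cstdint>

// Pick the smallest alignment that lets the backend use the widest DS access
// for a given size, but never lower an alignment that is already larger.
static uint64_t chooseLdsAlignment(uint64_t Size, uint64_t CurrentAlign) {
  uint64_t Wanted = 1;
  if (Size > 8)
    Wanted = 16; // candidate for a b96 or b128 load/store
  else if (Size > 4)
    Wanted = 8;  // candidate for a b64 load/store
  else if (Size > 2)
    Wanted = 4;  // candidate for a b32 load/store
  else if (Size > 1)
    Wanted = 2;  // candidate for a b16 load/store
  return CurrentAlign > Wanted ? CurrentAlign : Wanted;
}

// Example: a 9-byte LDS array declared with align 1 is bumped to align 16,
// while a 16-byte array that is already align 16 is left unchanged.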

@@ -9,7 +9,7 @@ define amdgpu_kernel void @use_lds_globals(i32 addrspace(1)* %out, i32 addrspace
; CHECK-LABEL: use_lds_globals:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 8
; CHECK-NEXT: v_mov_b32_e32 v0, 4
; CHECK-NEXT: s_mov_b32 m0, -1
; CHECK-NEXT: ds_read_b32 v3, v0
; CHECK-NEXT: v_mov_b32_e32 v2, 9
@@ -19,7 +19,7 @@ define amdgpu_kernel void @use_lds_globals(i32 addrspace(1)* %out, i32 addrspace
; CHECK-NEXT: v_mov_b32_e32 v0, s0
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: flat_store_dword v[0:1], v3
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_mov_b32_e32 v0, 0x200
; CHECK-NEXT: ds_write_b32 v0, v2
; CHECK-NEXT: s_endpgm
entry:


@@ -1007,10 +1007,9 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out
define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
; CI-LABEL: load_misaligned64_constant_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
; CI-NEXT: ds_read2_b32 v[2:3], v2 offset0:2 offset1:3
; CI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
@@ -1023,8 +1022,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
; GFX9-ALIGNED-LABEL: load_misaligned64_constant_offsets:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, 0
; GFX9-ALIGNED-NEXT: ds_read2_b32 v[0:1], v4 offset1:1
; GFX9-ALIGNED-NEXT: ds_read2_b32 v[2:3], v4 offset0:2 offset1:3
; GFX9-ALIGNED-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
@@ -1035,7 +1033,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
; GFX9-UNALIGNED-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
@@ -1054,11 +1052,10 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
; CI-LABEL: load_misaligned64_constant_large_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x4000
; CI-NEXT: v_mov_b32_e32 v2, 0x7ff8
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; CI-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
; CI-NEXT: ds_read_b64 v[0:1], v2 offset:16384
; CI-NEXT: ds_read_b64 v[2:3], v2 offset:32760
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
@@ -1070,12 +1067,10 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspac
;
; GFX9-LABEL: load_misaligned64_constant_large_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4000
; GFX9-NEXT: v_mov_b32_e32 v2, 0x7ff8
; GFX9-NEXT: ds_read2_b32 v[0:1], v0 offset1:1
; GFX9-NEXT: ds_read2_b32 v[2:3], v2 offset1:1
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: ds_read_b64 v[0:1], v4 offset:16384
; GFX9-NEXT: ds_read_b64 v[2:3], v4 offset:32760
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc


@@ -6,10 +6,10 @@
; offset0 is larger than offset1
; SI-LABEL: {{^}}offset_order:
; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:14{{$}}
; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1024
; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
; SI-DAG: ds_read_b64 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset:8
; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12
; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:6 offset1:248
define amdgpu_kernel void @offset_order(float addrspace(1)* %out) {
entry:
%ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0


@@ -820,33 +820,32 @@ define amdgpu_kernel void @store_constant_disjoint_offsets() {
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
; CI-LABEL: store_misaligned64_constant_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: s_movk_i32 s0, 0x7b
; CI-NEXT: s_mov_b32 s1, 0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v1, v0 offset1:1
; CI-NEXT: ds_write2_b32 v0, v1, v0 offset0:2 offset1:3
; CI-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:1
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: store_misaligned64_constant_offsets:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, 0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v0 offset1:1
; GFX9-ALIGNED-NEXT: ds_write2_b32 v0, v1, v0 offset0:2 offset1:3
; GFX9-ALIGNED-NEXT: s_movk_i32 s0, 0x7b
; GFX9-ALIGNED-NEXT: s_mov_b32 s1, 0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX9-ALIGNED-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:1
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_movk_i32 s0, 0x7b
; GFX9-UNALIGNED-NEXT: s_mov_b32 s1, 0
; GFX9-UNALIGNED-NEXT: s_mov_b32 s2, s0
; GFX9-UNALIGNED-NEXT: s_mov_b32 s3, s1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3
; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, v0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, v1
; GFX9-UNALIGNED-NEXT: ds_write_b128 v1, v[0:3]
; GFX9-UNALIGNED-NEXT: s_endpgm
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
@@ -858,23 +857,25 @@ define amdgpu_kernel void @store_misaligned64_constant_offsets() {
define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
; CI-LABEL: store_misaligned64_constant_large_offsets:
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0x4000
; CI-NEXT: v_mov_b32_e32 v1, 0x7b
; CI-NEXT: s_movk_i32 s0, 0x7b
; CI-NEXT: s_mov_b32 s1, 0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: v_mov_b32_e32 v0, 0x7ff8
; CI-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; CI-NEXT: ds_write_b64 v2, v[0:1] offset:16384
; CI-NEXT: ds_write_b64 v2, v[0:1] offset:32760
; CI-NEXT: s_endpgm
;
; GFX9-LABEL: store_misaligned64_constant_large_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x4000
; GFX9-NEXT: v_mov_b32_e32 v1, 0x7b
; GFX9-NEXT: s_movk_i32 s0, 0x7b
; GFX9-NEXT: s_mov_b32 s1, 0
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7ff8
; GFX9-NEXT: ds_write2_b32 v0, v1, v2 offset1:1
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:16384
; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:32760
; GFX9-NEXT: s_endpgm
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4


@@ -46,9 +46,9 @@ define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace
ret void
}
; 38 + (2 pad) + 38
; 38 + (10 pad) + 38 (= 86)
; HSA-LABEL: {{^}}test_round_size_2_align_8:
; HSA: workgroup_group_segment_byte_size = 78
; HSA: workgroup_group_segment_byte_size = 86
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -94,7 +94,6 @@ define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 add
ret void
}
; FIXME: missing alignment can be improved.
; (39 * 4) + (4 pad) + (7 * 8) = 216
; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
; HSA: workgroup_group_segment_byte_size = 216
@@ -127,10 +126,10 @@ define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)*
ret void
}
; align 32, 16, 8
; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
; align 32, 16, 16
; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134)
; HSA-LABEL: {{^}}test_round_size_3_order0:
; HSA: workgroup_group_segment_byte_size = 126
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
@@ -148,10 +147,10 @@ define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 ad
ret void
}
; align 32, 8, 16
; 38 (+ 10 pad) + 38 + (2 pad) + 38 = 126
; align 32, 16, 16
; 38 (+ 10 pad) + 38 + (10 pad) + 38 ( = 134)
; HSA-LABEL: {{^}}test_round_size_3_order1:
; HSA: workgroup_group_segment_byte_size = 126
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
@@ -169,10 +168,10 @@ define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 ad
ret void
}
; align 16, 32, 8
; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
; align 32, 16, 16
; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134)
; HSA-LABEL: {{^}}test_round_size_3_order2:
; HSA: workgroup_group_segment_byte_size = 126
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -190,11 +189,10 @@ define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 ad
ret void
}
; FIXME: Improve alignment
; align 16, 8, 32
; 38 + (10 pad) + 38 + (2 pad) + 38
; align 32, 16, 16
; 38 + (10 pad) + 38 + (10 pad) + 38 ( = 134)
; HSA-LABEL: {{^}}test_round_size_3_order3:
; HSA: workgroup_group_segment_byte_size = 126
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
@@ -212,10 +210,10 @@ define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 ad
ret void
}
; align 8, 32, 16
; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
; align 32, 16, 16
; 38 + (10 pad) + 38 + (10 pad) + 38 (= 134)
; HSA-LABEL: {{^}}test_round_size_3_order4:
; HSA: workgroup_group_segment_byte_size = 126
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
@@ -233,10 +231,10 @@ define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 ad
ret void
}
; align 8, 16, 32
; 38 + (10 pad) + 38 + (2 pad) + 38 = 126
; align 32, 16, 16
; 38 + (10 pad) + 38 + (10 pad) + 38 (= 134)
; HSA-LABEL: {{^}}test_round_size_3_order5:
; HSA: workgroup_group_segment_byte_size = 126
; HSA: workgroup_group_segment_byte_size = 134
; HSA: group_segment_alignment = 4
define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
%lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*

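The updated workgroup_group_segment_byte_size values above (86 for two arrays, 134 for three) follow directly from laying out 38-byte arrays once each has been bumped to align 16. A small sketch of that arithmetic, using a generic align-up helper rather than the pass's own code:

#include <cstdint>
#include <cstdio>

// Round Offset up to the next multiple of Align (Align must be a power of two).
static uint64_t alignUp(uint64_t Offset, uint64_t Align) {
  return (Offset + Align - 1) & ~(Align - 1);
}

int main() {
  // Three 38-byte LDS arrays, each bumped to align 16 by the new rule.
  uint64_t Offset = 0;
  for (int I = 0; I < 3; ++I) {
    Offset = alignUp(Offset, 16); // 0, 48, 96
    Offset += 38;                 // 38, 86, 134
  }
  printf("%llu\n", (unsigned long long)Offset); // prints 134
  return 0;
}

Stopping after two iterations gives 86, matching test_round_size_2_align_8.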

@@ -8,7 +8,7 @@
; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
; IR: alloca [10 x i32]
; ASM-LABEL: {{^}}promote_alloca_size_256:
; ASM: .amdgpu_lds llvm.amdgcn.kernel.promote_alloca_size_256.lds, 60000, 4
; ASM: .amdgpu_lds llvm.amdgcn.kernel.promote_alloca_size_256.lds, 60000, 16
; ASM-NOT: .amdgpu_lds
define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {


@@ -0,0 +1,193 @@
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
; Properly aligned, same size as alignment.
; CHECK: %llvm.amdgcn.kernel.k0.lds.t = type { [16 x i8], [8 x i8], [4 x i8], [2 x i8], [1 x i8] }
; Globals with different but already sufficient alignments, all of size 1.
; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [1 x i8], [7 x i8], [1 x i8], [3 x i8], [1 x i8], [1 x i8], [1 x i8], [1 x i8] }
; All are under-aligned; each needs to be bumped to a different alignment boundary (a worked layout for this case follows the CHECK lines below).
; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { [9 x i8], [7 x i8], [5 x i8], [3 x i8], [3 x i8], [1 x i8], [2 x i8] }
; All LDS globals are under-aligned and need to be allocated on an 8-byte boundary.
; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [7 x i8], [1 x i8], [7 x i8], [1 x i8], [6 x i8], [2 x i8], [5 x i8] }
; All LDS globals are under-aligned and need to be allocated on a 16-byte boundary.
; CHECK: %llvm.amdgcn.kernel.k4.lds.t = type { [12 x i8], [4 x i8], [11 x i8], [5 x i8], [10 x i8], [6 x i8], [9 x i8] }
; All LDS globals are properly aligned on a 16-byte boundary, but they have different sizes.
; CHECK: %llvm.amdgcn.kernel.k5.lds.t = type { [20 x i8], [12 x i8], [19 x i8], [13 x i8], [18 x i8], [14 x i8], [17 x i8] }
; CHECK: @llvm.amdgcn.kernel.k0.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k0.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 8
; CHECK: @llvm.amdgcn.kernel.k4.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k4.lds.t undef, align 16
; CHECK: @llvm.amdgcn.kernel.k5.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k5.lds.t undef, align 16
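As a concrete illustration of how those struct types arise, the sketch below (an illustrative reconstruction, not the pass itself) lays out the @k2 globals after the alignment bump and reproduces the padding members of %llvm.amdgcn.kernel.k2.lds.t:

#include <cstdint>
#include <cstdio>

struct Member { uint64_t Size, Align; };

static uint64_t alignUp(uint64_t Offset, uint64_t Align) {
  return (Offset + Align - 1) & ~(Align - 1);
}

int main() {
  // @k2 globals after bumping, already sorted by alignment then size:
  // [9 x i8] -> align 16, [5 x i8] -> align 8, [3 x i8] -> align 4, [2 x i8] -> align 2.
  Member Members[] = {{9, 16}, {5, 8}, {3, 4}, {2, 2}};
  uint64_t Offset = 0;
  for (const Member &M : Members) {
    uint64_t Aligned = alignUp(Offset, M.Align);
    if (Aligned != Offset) // padding member inserted by the layout
      printf("[%llu x i8] (pad)\n", (unsigned long long)(Aligned - Offset));
    printf("[%llu x i8]\n", (unsigned long long)M.Size);
    Offset = Aligned + M.Size;
  }
  return 0;
}

The printed sequence is [9], [7] pad, [5], [3] pad, [3], [1] pad, [2], matching the CHECK line for k2 above.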
; Properly aligned, same size as alignment.
; CHECK-NOT: @k0.lds.size.1.align.1
; CHECK-NOT: @k0.lds.size.2.align.2
; CHECK-NOT: @k0.lds.size.4.align.4
; CHECK-NOT: @k0.lds.size.8.align.8
; CHECK-NOT: @k0.lds.size.16.align.16
@k0.lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
@k0.lds.size.2.align.2 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 2
@k0.lds.size.4.align.4 = internal unnamed_addr addrspace(3) global [4 x i8] undef, align 4
@k0.lds.size.8.align.8 = internal unnamed_addr addrspace(3) global [8 x i8] undef, align 8
@k0.lds.size.16.align.16 = internal unnamed_addr addrspace(3) global [16 x i8] undef, align 16
define amdgpu_kernel void @k0() {
%k0.lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @k0.lds.size.1.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %k0.lds.size.1.align.1.bc, align 1
%k0.lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* @k0.lds.size.2.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %k0.lds.size.2.align.2.bc, align 2
%k0.lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* @k0.lds.size.4.align.4 to i8 addrspace(3)*
store i8 3, i8 addrspace(3)* %k0.lds.size.4.align.4.bc, align 4
%k0.lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* @k0.lds.size.8.align.8 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %k0.lds.size.8.align.8.bc, align 8
%k0.lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* @k0.lds.size.16.align.16 to i8 addrspace(3)*
store i8 5, i8 addrspace(3)* %k0.lds.size.16.align.16.bc, align 16
ret void
}
; Globals with different but already sufficient alignments, all of size 1.
; CHECK-NOT: @k1.lds.size.1.align.1
; CHECK-NOT: @k1.lds.size.1.align.2
; CHECK-NOT: @k1.lds.size.1.align.4
; CHECK-NOT: @k1.lds.size.1.align.8
; CHECK-NOT: @k1.lds.size.1.align.16
@k1.lds.size.1.align.1 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 1
@k1.lds.size.1.align.2 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 2
@k1.lds.size.1.align.4 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 4
@k1.lds.size.1.align.8 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 8
@k1.lds.size.1.align.16 = internal unnamed_addr addrspace(3) global [1 x i8] undef, align 16
define amdgpu_kernel void @k1() {
%k1.lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @k1.lds.size.1.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %k1.lds.size.1.align.1.bc, align 1
%k1.lds.size.1.align.2.bc = bitcast [1 x i8] addrspace(3)* @k1.lds.size.1.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %k1.lds.size.1.align.2.bc, align 2
%k1.lds.size.1.align.4.bc = bitcast [1 x i8] addrspace(3)* @k1.lds.size.1.align.4 to i8 addrspace(3)*
store i8 3, i8 addrspace(3)* %k1.lds.size.1.align.4.bc, align 4
%k1.lds.size.1.align.8.bc = bitcast [1 x i8] addrspace(3)* @k1.lds.size.1.align.8 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %k1.lds.size.1.align.8.bc, align 8
%k1.lds.size.1.align.16.bc = bitcast [1 x i8] addrspace(3)* @k1.lds.size.1.align.16 to i8 addrspace(3)*
store i8 5, i8 addrspace(3)* %k1.lds.size.1.align.16.bc, align 16
ret void
}
; All are under-aligned; each needs to be bumped to a different alignment boundary.
; CHECK-NOT: @k2.lds.size.2.align.1
; CHECK-NOT: @k2.lds.size.3.align.2
; CHECK-NOT: @k2.lds.size.5.align.4
; CHECK-NOT: @k2.lds.size.9.align.8
@k2.lds.size.2.align.1 = internal unnamed_addr addrspace(3) global [2 x i8] undef, align 1
@k2.lds.size.3.align.2 = internal unnamed_addr addrspace(3) global [3 x i8] undef, align 2
@k2.lds.size.5.align.4 = internal unnamed_addr addrspace(3) global [5 x i8] undef, align 4
@k2.lds.size.9.align.8 = internal unnamed_addr addrspace(3) global [9 x i8] undef, align 8
define amdgpu_kernel void @k2() {
%k2.lds.size.2.align.1.bc = bitcast [2 x i8] addrspace(3)* @k2.lds.size.2.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %k2.lds.size.2.align.1.bc, align 1
%k2.lds.size.3.align.2.bc = bitcast [3 x i8] addrspace(3)* @k2.lds.size.3.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %k2.lds.size.3.align.2.bc, align 2
%k2.lds.size.5.align.4.bc = bitcast [5 x i8] addrspace(3)* @k2.lds.size.5.align.4 to i8 addrspace(3)*
store i8 3, i8 addrspace(3)* %k2.lds.size.5.align.4.bc, align 4
%k2.lds.size.9.align.8.bc = bitcast [9 x i8] addrspace(3)* @k2.lds.size.9.align.8 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %k2.lds.size.9.align.8.bc, align 8
ret void
}
; All LDS globals are under-aligned and need to be allocated on an 8-byte boundary.
; CHECK-NOT: @k3.lds.size.5.align.2
; CHECK-NOT: @k3.lds.size.6.align.2
; CHECK-NOT: @k3.lds.size.7.align.2
; CHECK-NOT: @k3.lds.size.7.align.4
@k3.lds.size.5.align.2 = internal unnamed_addr addrspace(3) global [5 x i8] undef, align 2
@k3.lds.size.6.align.2 = internal unnamed_addr addrspace(3) global [6 x i8] undef, align 2
@k3.lds.size.7.align.2 = internal unnamed_addr addrspace(3) global [7 x i8] undef, align 2
@k3.lds.size.7.align.4 = internal unnamed_addr addrspace(3) global [7 x i8] undef, align 4
define amdgpu_kernel void @k3() {
%k3.lds.size.5.align.2.bc = bitcast [5 x i8] addrspace(3)* @k3.lds.size.5.align.2 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %k3.lds.size.5.align.2.bc, align 2
%k3.lds.size.6.align.2.bc = bitcast [6 x i8] addrspace(3)* @k3.lds.size.6.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %k3.lds.size.6.align.2.bc, align 2
%k3.lds.size.7.align.2.bc = bitcast [7 x i8] addrspace(3)* @k3.lds.size.7.align.2 to i8 addrspace(3)*
store i8 3, i8 addrspace(3)* %k3.lds.size.7.align.2.bc, align 2
%k3.lds.size.7.align.4.bc = bitcast [7 x i8] addrspace(3)* @k3.lds.size.7.align.4 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %k3.lds.size.7.align.4.bc, align 4
ret void
}
; All LDS globals are under-aligned and need to be allocated on a 16-byte boundary.
; CHECK-NOT: @k4.lds.size.9.align.1
; CHECK-NOT: @k4.lds.size.10.align.2
; CHECK-NOT: @k4.lds.size.11.align.4
; CHECK-NOT: @k4.lds.size.12.align.8
@k4.lds.size.9.align.1 = internal unnamed_addr addrspace(3) global [9 x i8] undef, align 1
@k4.lds.size.10.align.2 = internal unnamed_addr addrspace(3) global [10 x i8] undef, align 2
@k4.lds.size.11.align.4 = internal unnamed_addr addrspace(3) global [11 x i8] undef, align 4
@k4.lds.size.12.align.8 = internal unnamed_addr addrspace(3) global [12 x i8] undef, align 8
define amdgpu_kernel void @k4() {
%k4.lds.size.9.align.1.bc = bitcast [9 x i8] addrspace(3)* @k4.lds.size.9.align.1 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %k4.lds.size.9.align.1.bc, align 1
%k4.lds.size.10.align.2.bc = bitcast [10 x i8] addrspace(3)* @k4.lds.size.10.align.2 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %k4.lds.size.10.align.2.bc, align 2
%k4.lds.size.11.align.4.bc = bitcast [11 x i8] addrspace(3)* @k4.lds.size.11.align.4 to i8 addrspace(3)*
store i8 3, i8 addrspace(3)* %k4.lds.size.11.align.4.bc, align 4
%k4.lds.size.12.align.8.bc = bitcast [12 x i8] addrspace(3)* @k4.lds.size.12.align.8 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %k4.lds.size.12.align.8.bc, align 8
ret void
}
; CHECK-NOT: @k5.lds.size.17.align.16
; CHECK-NOT: @k5.lds.size.18.align.16
; CHECK-NOT: @k5.lds.size.19.align.16
; CHECK-NOT: @k5.lds.size.20.align.16
@k5.lds.size.17.align.16 = internal unnamed_addr addrspace(3) global [17 x i8] undef, align 16
@k5.lds.size.18.align.16 = internal unnamed_addr addrspace(3) global [18 x i8] undef, align 16
@k5.lds.size.19.align.16 = internal unnamed_addr addrspace(3) global [19 x i8] undef, align 16
@k5.lds.size.20.align.16 = internal unnamed_addr addrspace(3) global [20 x i8] undef, align 16
define amdgpu_kernel void @k5() {
%k5.lds.size.17.align.16.bc = bitcast [17 x i8] addrspace(3)* @k5.lds.size.17.align.16 to i8 addrspace(3)*
store i8 1, i8 addrspace(3)* %k5.lds.size.17.align.16.bc, align 16
%k5.lds.size.18.align.16.bc = bitcast [18 x i8] addrspace(3)* @k5.lds.size.18.align.16 to i8 addrspace(3)*
store i8 2, i8 addrspace(3)* %k5.lds.size.18.align.16.bc, align 16
%k5.lds.size.19.align.16.bc = bitcast [19 x i8] addrspace(3)* @k5.lds.size.19.align.16 to i8 addrspace(3)*
store i8 3, i8 addrspace(3)* %k5.lds.size.19.align.16.bc, align 16
%k5.lds.size.20.align.16.bc = bitcast [20 x i8] addrspace(3)* @k5.lds.size.20.align.16 to i8 addrspace(3)*
store i8 4, i8 addrspace(3)* %k5.lds.size.20.align.16.bc, align 16
ret void
}