
[AMDGPU] Propagate LDS align into instructions

Differential Revision: https://reviews.llvm.org/D104316
Stanislav Mekhanoshin 2021-06-14 17:01:54 -07:00
parent 66df23eddc
commit 5459f63eb4
11 changed files with 197 additions and 71 deletions
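
In short: the lowering pass packs LDS globals into one struct, and the alignment that can be assumed for any field is the struct's alignment combined with the field's byte offset; this change writes that refined alignment back onto the load, store, and atomic instructions that use the field. A minimal standalone sketch of the arithmetic (editor's illustration, not part of the commit; commonAlign is a stand-in for llvm::commonAlignment):

// Sketch: the alignment known for a struct element is the largest power of
// two dividing both the struct alignment and the element's byte offset.
#include <cassert>
#include <cstdint>

static uint64_t commonAlign(uint64_t StructAlign, uint64_t Off) {
  uint64_t V = StructAlign | Off;
  return V & (~V + 1); // lowest set bit of the OR, i.e. the common divisor
}

int main() {
  // For a 16-byte-aligned LDS struct: offset 0 keeps align 16, offset 8
  // yields align 8, offset 2 yields align 2. Knowing align 16 at offset 0
  // is what lets two adjacent 8-byte LDS accesses become a single
  // ds_read_b128/ds_write_b128 in the tests below.
  assert(commonAlign(16, 0) == 16);
  assert(commonAlign(16, 8) == 8);
  assert(commonAlign(16, 2) == 2);
  return 0;
}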


@@ -309,6 +309,10 @@ private:
        UsedList.erase(GV);
        GV->eraseFromParent();
      }
      uint64_t Off = DL.getStructLayout(LDSTy)->getElementOffset(I);
      Align A = commonAlignment(StructAlign, Off);
      refineUsesAlignment(GEP, A, DL);
    }
    // Mark kernels with asm that reads the address of the allocated structure
@@ -328,6 +332,46 @@ private:
    }
    return true;
  }
  void refineUsesAlignment(Value *Ptr, Align A, const DataLayout &DL,
                           unsigned MaxDepth = 5) {
    if (!MaxDepth)
      return;
    for (User *U : Ptr->users()) {
      if (auto *LI = dyn_cast<LoadInst>(U)) {
        LI->setAlignment(std::max(A, LI->getAlign()));
        continue;
      }
      if (auto *SI = dyn_cast<StoreInst>(U)) {
        SI->setAlignment(std::max(A, SI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicRMWInst>(U)) {
        AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *AI = dyn_cast<AtomicCmpXchgInst>(U)) {
        AI->setAlignment(std::max(A, AI->getAlign()));
        continue;
      }
      if (auto *GEP = dyn_cast<GetElementPtrInst>(U)) {
        unsigned BitWidth = DL.getIndexTypeSizeInBits(GEP->getType());
        APInt Off(BitWidth, 0);
        if (GEP->getPointerOperand() == Ptr &&
            GEP->accumulateConstantOffset(DL, Off)) {
          Align GA = commonAlignment(A, Off.getLimitedValue());
          refineUsesAlignment(GEP, GA, DL, MaxDepth - 1);
        }
        continue;
      }
      if (auto *I = dyn_cast<Instruction>(U)) {
        if (I->getOpcode() == Instruction::BitCast ||
            I->getOpcode() == Instruction::AddrSpaceCast)
          refineUsesAlignment(I, A, DL, MaxDepth - 1);
      }
    }
  }
};
} // namespace
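
The refineUsesAlignment helper added above walks the users of each rewritten LDS pointer and raises the alignment recorded on loads, stores, and atomics, recursing through constant-offset GEPs (re-deriving the alignment from the accumulated offset) and through bitcasts/addrspacecasts, bounded by MaxDepth. A self-contained toy model of that walk (editor's sketch only; the Node graph and helper names are invented for illustration and are not LLVM APIs):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Stand-in for llvm::commonAlignment: largest power of two dividing both.
static uint64_t commonAlign(uint64_t A, uint64_t Off) {
  uint64_t V = A | Off;
  return V & (~V + 1);
}

// A node is either a memory access whose alignment may be raised, a GEP
// with a constant byte offset, or an alignment-preserving cast.
struct Node {
  enum Kind { MemAccess, Gep, Cast };
  Kind K;
  uint64_t Align;            // MemAccess: current alignment; otherwise unused
  uint64_t Off;              // Gep: constant byte offset; otherwise 0
  std::vector<Node *> Users; // Gep/Cast: downstream users of the pointer
};

// Mirrors the shape of refineUsesAlignment: visit users, never lower an
// existing alignment, and recurse through GEPs and casts up to MaxDepth.
static void refine(Node &Ptr, uint64_t A, unsigned MaxDepth = 5) {
  if (!MaxDepth)
    return;
  for (Node *U : Ptr.Users) {
    switch (U->K) {
    case Node::MemAccess:
      U->Align = std::max(U->Align, A);
      break;
    case Node::Gep:
      refine(*U, commonAlign(A, U->Off), MaxDepth - 1);
      break;
    case Node::Cast:
      refine(*U, A, MaxDepth - 1);
      break;
    }
  }
}

int main() {
  // A load reached through base -> GEP(+8): if the base pointer is known
  // 16-aligned, the load's alignment rises from 4 to commonAlign(16, 8) == 8.
  Node Load{Node::MemAccess, 4, 0, {}};
  Node Gep{Node::Gep, 0, 8, {&Load}};
  Node Base{Node::Cast, 0, 0, {&Gep}};
  refine(Base, 16);
  assert(Load.Align == 8);
  return 0;
}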


@@ -1009,7 +1009,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
; CI: ; %bb.0:
; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_read2_b64 v[0:3], v0 offset1:1
; CI-NEXT: ds_read_b128 v[0:3], v0
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
@@ -1019,27 +1019,16 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)*
; CI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: load_misaligned64_constant_offsets:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v4, 0
; GFX9-ALIGNED-NEXT: ds_read2_b64 v[0:3], v4 offset1:1
; GFX9-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-ALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-ALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-ALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: load_misaligned64_constant_offsets:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v4, 0
; GFX9-UNALIGNED-NEXT: ds_read_b128 v[0:3], v4
; GFX9-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-UNALIGNED-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-UNALIGNED-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-UNALIGNED-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-UNALIGNED-NEXT: s_endpgm
; GFX9-LABEL: load_misaligned64_constant_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v4, 0
; GFX9-NEXT: ds_read_b128 v[0:3], v4
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; GFX9-NEXT: s_endpgm
%val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
%val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
%sum = add i64 %val0, %val1


@@ -818,33 +818,22 @@ define amdgpu_kernel void @store_constant_disjoint_offsets() {
define amdgpu_kernel void @store_misaligned64_constant_offsets() {
; CI-LABEL: store_misaligned64_constant_offsets:
; CI: ; %bb.0:
; CI-NEXT: s_movk_i32 s0, 0x7b
; CI-NEXT: s_mov_b32 s1, 0
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v2, 0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v0, 0x7b
; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, v0
; CI-NEXT: v_mov_b32_e32 v3, v1
; CI-NEXT: s_mov_b32 m0, -1
; CI-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:1
; CI-NEXT: ds_write_b128 v1, v[0:3]
; CI-NEXT: s_endpgm
;
; GFX9-ALIGNED-LABEL: store_misaligned64_constant_offsets:
; GFX9-ALIGNED: ; %bb.0:
; GFX9-ALIGNED-NEXT: s_movk_i32 s0, 0x7b
; GFX9-ALIGNED-NEXT: s_mov_b32 s1, 0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v2, 0
; GFX9-ALIGNED-NEXT: v_mov_b32_e32 v1, s1
; GFX9-ALIGNED-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:1
; GFX9-ALIGNED-NEXT: s_endpgm
;
; GFX9-UNALIGNED-LABEL: store_misaligned64_constant_offsets:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, 0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, v0
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, v1
; GFX9-UNALIGNED-NEXT: ds_write_b128 v1, v[0:3]
; GFX9-UNALIGNED-NEXT: s_endpgm
; GFX9-LABEL: store_misaligned64_constant_offsets:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: v_mov_b32_e32 v2, v0
; GFX9-NEXT: v_mov_b32_e32 v3, v1
; GFX9-NEXT: ds_write_b128 v1, v[0:3]
; GFX9-NEXT: s_endpgm
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
ret void


@@ -25,11 +25,11 @@ define amdgpu_kernel void @k0() {
; CHECK-LABEL: @k0(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
@@ -53,9 +53,9 @@ define amdgpu_kernel void @k1() {
; CHECK-LABEL: @k1(
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
@@ -101,9 +101,9 @@ define amdgpu_kernel void @1() {
define void @f0() {
; CHECK-LABEL: @f0(
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 8
; CHECK-NEXT: %lds.size.8.align.8.bc = bitcast [8 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 4
; CHECK-NEXT: store i8 8, i8 addrspace(3)* %lds.size.8.align.8.bc, align 8
; CHECK-NEXT: ret void
;
%lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* @lds.size.1.align.1 to i8 addrspace(3)*


@@ -76,7 +76,7 @@ define amdgpu_kernel void @k3(i64 %x) {
; CHECK-NEXT: %3 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k3.lds.t, %llvm.amdgcn.kernel.k3.lds.t addrspace(3)* @llvm.amdgcn.kernel.k3.lds, i32 0, i32 0), i32 0, i32 24
; CHECK-NEXT: %4 = bitcast i8 addrspace(3)* %3 to i64 addrspace(3)*
; CHECK-NEXT: %ptr2 = addrspacecast i64 addrspace(3)* %4 to i64*
; CHECK-NEXT: store i64 2, i64* %ptr2, align 1
; CHECK-NEXT: store i64 2, i64* %ptr2, align 8
; CHECK-NEXT: ret void
;
%ptr1 = addrspacecast i64 addrspace(3)* bitcast (i8 addrspace(3)* getelementptr inbounds ([32 x i8], [32 x i8] addrspace(3)* @lds.3, i32 0, i32 16) to i64 addrspace(3)*) to i64*


@@ -41,7 +41,7 @@
@llvm.compiler.used = appending global [3 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.2 to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64* addrspace(1)* @gptr.4 to i8 addrspace(1)*) to i8*), i8* addrspacecast (i8 addrspace(3)* bitcast (i32 addrspace(3)* @lds.6 to i8 addrspace(3)*) to i8*)], section "llvm.metadata"
; CHECK-LABEL: @k0()
; CHECK: %ld.lds.1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 2
; CHECK: %ld.lds.1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1), align 4
; CHECK: %ld.lds.2 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0), align 4
; CHECK: %ld.lds.3 = load i64, i64 addrspace(3)* @lds.3, align 4
; CHECK: %ld.lds.4 = load float, float addrspace(3)* @lds.4, align 4


@@ -4,6 +4,8 @@
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-super-align-lds-globals=false < %s | FileCheck --check-prefixes=CHECK,SUPER-ALIGN_OFF %s
; CHECK: %llvm.amdgcn.kernel.k1.lds.t = type { [32 x i8] }
; CHECK: %llvm.amdgcn.kernel.k2.lds.t = type { i16, [2 x i8], i16 }
; CHECK: %llvm.amdgcn.kernel.k3.lds.t = type { [32 x i64], [32 x i32] }
; CHECK-NOT: @lds.1
@lds.1 = internal unnamed_addr addrspace(3) global [32 x i8] undef, align 1
@@ -11,6 +13,10 @@
; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 16
; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k1.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k1.lds.t undef, align 1
; CHECK: @llvm.amdgcn.kernel.k2.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k2.lds.t undef, align 4
; SUPER-ALIGN_ON: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 16
; SUPER-ALIGN_OFF: @llvm.amdgcn.kernel.k3.lds = internal addrspace(3) global %llvm.amdgcn.kernel.k3.lds.t undef, align 8
; CHECK-LABEL: @k1
; CHECK: %1 = getelementptr inbounds [32 x i8], [32 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0), i32 0, i32 0
; CHECK: %2 = addrspacecast i8 addrspace(3)* %1 to i8*
@@ -21,3 +27,103 @@ define amdgpu_kernel void @k1(i64 %x) {
store i8 1, i8 addrspace(0)* %ptr, align 1
ret void
}
@lds.2 = internal unnamed_addr addrspace(3) global i16 undef, align 4
@lds.3 = internal unnamed_addr addrspace(3) global i16 undef, align 4
; Check that alignment is propagated to uses for scalar variables.
; CHECK-LABEL: @k2
; CHECK: store i16 1, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 0), align 4
; CHECK: store i16 2, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k2.lds.t, %llvm.amdgcn.kernel.k2.lds.t addrspace(3)* @llvm.amdgcn.kernel.k2.lds, i32 0, i32 2), align 4
define amdgpu_kernel void @k2() {
store i16 1, i16 addrspace(3)* @lds.2, align 2
store i16 2, i16 addrspace(3)* @lds.3, align 2
ret void
}
@lds.4 = internal unnamed_addr addrspace(3) global [32 x i64] undef, align 8
@lds.5 = internal unnamed_addr addrspace(3) global [32 x i32] undef, align 4
; Check that alignment is propagated to uses for arrays.
; CHECK-LABEL: @k3
; CHECK: store i32 1, i32 addrspace(3)* %ptr1, align 8
; CHECK: store i32 2, i32 addrspace(3)* %ptr2, align 4
; SUPER-ALIGN_ON: store i32 3, i32 addrspace(3)* %ptr3, align 16
; SUPER-ALIGN_OFF: store i32 3, i32 addrspace(3)* %ptr3, align 8
; CHECK: store i32 4, i32 addrspace(3)* %ptr4, align 4
; CHECK: store i32 5, i32 addrspace(3)* %ptr5, align 4
; CHECK: %load1 = load i32, i32 addrspace(3)* %ptr1, align 8
; CHECK: %load2 = load i32, i32 addrspace(3)* %ptr2, align 4
; SUPER-ALIGN_ON: %load3 = load i32, i32 addrspace(3)* %ptr3, align 16
; SUPER-ALIGN_OFF: %load3 = load i32, i32 addrspace(3)* %ptr3, align 8
; CHECK: %load4 = load i32, i32 addrspace(3)* %ptr4, align 4
; CHECK: %load5 = load i32, i32 addrspace(3)* %ptr5, align 4
; CHECK: %val1 = atomicrmw volatile add i32 addrspace(3)* %ptr1, i32 1 monotonic, align 8
; CHECK: %val2 = cmpxchg volatile i32 addrspace(3)* %ptr1, i32 1, i32 2 monotonic monotonic, align 8
; CHECK: %ptr1.bc = bitcast i32 addrspace(3)* %ptr1 to i16 addrspace(3)*
; CHECK: %ptr2.bc = bitcast i32 addrspace(3)* %ptr2 to i16 addrspace(3)*
; CHECK: %ptr3.bc = bitcast i32 addrspace(3)* %ptr3 to i16 addrspace(3)*
; CHECK: %ptr4.bc = bitcast i32 addrspace(3)* %ptr4 to i16 addrspace(3)*
; CHECK: store i16 11, i16 addrspace(3)* %ptr1.bc, align 8
; CHECK: store i16 12, i16 addrspace(3)* %ptr2.bc, align 4
; SUPER-ALIGN_ON: store i16 13, i16 addrspace(3)* %ptr3.bc, align 16
; SUPER-ALIGN_OFF: store i16 13, i16 addrspace(3)* %ptr3.bc, align 8
; CHECK: store i16 14, i16 addrspace(3)* %ptr4.bc, align 4
; CHECK: %ptr1.ac = addrspacecast i32 addrspace(3)* %ptr1 to i32*
; CHECK: %ptr2.ac = addrspacecast i32 addrspace(3)* %ptr2 to i32*
; CHECK: %ptr3.ac = addrspacecast i32 addrspace(3)* %ptr3 to i32*
; CHECK: %ptr4.ac = addrspacecast i32 addrspace(3)* %ptr4 to i32*
; CHECK: store i32 21, i32* %ptr1.ac, align 8
; CHECK: store i32 22, i32* %ptr2.ac, align 4
; SUPER-ALIGN_ON: store i32 23, i32* %ptr3.ac, align 16
; SUPER-ALIGN_OFF: store i32 23, i32* %ptr3.ac, align 8
; CHECK: store i32 24, i32* %ptr4.ac, align 4
define amdgpu_kernel void @k3(i64 %x) {
%ptr0 = getelementptr inbounds i64, i64 addrspace(3)* bitcast ([32 x i64] addrspace(3)* @lds.4 to i64 addrspace(3)*), i64 0
store i64 0, i64 addrspace(3)* %ptr0, align 8
%ptr1 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 2
%ptr2 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 3
%ptr3 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 4
%ptr4 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 5
%ptr5 = getelementptr inbounds i32, i32 addrspace(3)* bitcast ([32 x i32] addrspace(3)* @lds.5 to i32 addrspace(3)*), i64 %x
store i32 1, i32 addrspace(3)* %ptr1, align 4
store i32 2, i32 addrspace(3)* %ptr2, align 4
store i32 3, i32 addrspace(3)* %ptr3, align 4
store i32 4, i32 addrspace(3)* %ptr4, align 4
store i32 5, i32 addrspace(3)* %ptr5, align 4
%load1 = load i32, i32 addrspace(3)* %ptr1, align 4
%load2 = load i32, i32 addrspace(3)* %ptr2, align 4
%load3 = load i32, i32 addrspace(3)* %ptr3, align 4
%load4 = load i32, i32 addrspace(3)* %ptr4, align 4
%load5 = load i32, i32 addrspace(3)* %ptr5, align 4
%val1 = atomicrmw volatile add i32 addrspace(3)* %ptr1, i32 1 monotonic, align 4
%val2 = cmpxchg volatile i32 addrspace(3)* %ptr1, i32 1, i32 2 monotonic monotonic, align 4
%ptr1.bc = bitcast i32 addrspace(3)* %ptr1 to i16 addrspace(3)*
%ptr2.bc = bitcast i32 addrspace(3)* %ptr2 to i16 addrspace(3)*
%ptr3.bc = bitcast i32 addrspace(3)* %ptr3 to i16 addrspace(3)*
%ptr4.bc = bitcast i32 addrspace(3)* %ptr4 to i16 addrspace(3)*
store i16 11, i16 addrspace(3)* %ptr1.bc, align 2
store i16 12, i16 addrspace(3)* %ptr2.bc, align 2
store i16 13, i16 addrspace(3)* %ptr3.bc, align 2
store i16 14, i16 addrspace(3)* %ptr4.bc, align 2
%ptr1.ac = addrspacecast i32 addrspace(3)* %ptr1 to i32*
%ptr2.ac = addrspacecast i32 addrspace(3)* %ptr2 to i32*
%ptr3.ac = addrspacecast i32 addrspace(3)* %ptr3 to i32*
%ptr4.ac = addrspacecast i32 addrspace(3)* %ptr4 to i32*
store i32 21, i32* %ptr1.ac, align 4
store i32 22, i32* %ptr2.ac, align 4
store i32 23, i32* %ptr3.ac, align 4
store i32 24, i32* %ptr4.ac, align 4
ret void
}


@@ -18,11 +18,11 @@
define amdgpu_kernel void @k0() {
; CHECK-LABEL: @k0(
; CHECK-NEXT: %lds.size.1.align.1.bc = bitcast [1 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 3) to i8 addrspace(3)*
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 1
; CHECK-NEXT: store i8 1, i8 addrspace(3)* %lds.size.1.align.1.bc, align 2
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k0.lds.t, %llvm.amdgcn.kernel.k0.lds.t addrspace(3)* @llvm.amdgcn.kernel.k0.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void
@@ -45,9 +45,9 @@ define amdgpu_kernel void @k0() {
define amdgpu_kernel void @k1() {
; CHECK-LABEL: @k1(
; CHECK-NEXT: %lds.size.2.align.2.bc = bitcast [2 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 2) to i8 addrspace(3)*
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 2
; CHECK-NEXT: store i8 2, i8 addrspace(3)* %lds.size.2.align.2.bc, align 4
; CHECK-NEXT: %lds.size.4.align.4.bc = bitcast [4 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i8 addrspace(3)*
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 4
; CHECK-NEXT: store i8 4, i8 addrspace(3)* %lds.size.4.align.4.bc, align 16
; CHECK-NEXT: %lds.size.16.align.16.bc = bitcast [16 x i8] addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i8 addrspace(3)*
; CHECK-NEXT: store i8 16, i8 addrspace(3)* %lds.size.16.align.16.bc, align 16
; CHECK-NEXT: ret void


@@ -29,7 +29,7 @@
@llvm.compiler.used = appending global [2 x i8*] [i8* addrspacecast (i8 addrspace(3)* bitcast (float addrspace(3)* @tolower to i8 addrspace(3)*) to i8*), i8* addrspacecast (i8 addrspace(1)* bitcast (i64 addrspace(1)* @ignored to i8 addrspace(1)*) to i8*)], section "llvm.metadata"
; CHECK-LABEL: @func()
; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.000000e+00 monotonic, align 4
; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.000000e+00 monotonic, align 8
define void @func() {
%dec = atomicrmw fsub float addrspace(3)* @tolower, float 1.0 monotonic
%unused0 = atomicrmw add i64 addrspace(1)* @ignored, i64 1 monotonic


@@ -24,9 +24,9 @@
; Use in func rewritten to access struct at address zero
; CHECK-LABEL: @func()
; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 1.0
; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
; CHECK: %val0 = load i32, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 8
; CHECK: %val1 = add i32 %val0, 4
; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 4
; CHECK: store i32 %val1, i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 8
; CHECK: %unused0 = atomicrmw add i64 addrspace(3)* @with_init, i64 1 monotonic
define void @func() {
%dec = atomicrmw fsub float addrspace(3)* @var0, float 1.0 monotonic
@@ -41,7 +41,7 @@ define void @func() {
; CHECK-LABEL: @kern_call()
; CHECK: call void @llvm.donothing() [ "ExplicitUse"(%llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds) ]
; CHECK: call void @func()
; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 4
; CHECK: %dec = atomicrmw fsub float addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 0), float 2.000000e+00 monotonic, align 8
define amdgpu_kernel void @kern_call() {
call void @func()
%dec = atomicrmw fsub float addrspace(3)* @var0, float 2.0 monotonic


@@ -73,7 +73,7 @@
; LOWER_LDS-LABEL: @f1
; LOWER_LDS: %1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2
; LOWER_LDS: %1 = load i16, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 16
; LOWER_LDS: %2 = getelementptr i8, i8 addrspace(3)* null, i16 %1
; LOWER_LDS: %3 = bitcast i8 addrspace(3)* %2 to i32 addrspace(3)*
; LOWER_LDS: store i32 7, i32 addrspace(3)* %3, align 4
@@ -153,7 +153,7 @@ define void @f2() {
; LOWER_LDS: %4 = ptrtoint i64 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 0) to i16
; LOWER_LDS: store i16 %4, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 2), align 2
; LOWER_LDS: %5 = ptrtoint i32 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.kernel.k1.lds.t, %llvm.amdgcn.kernel.k1.lds.t addrspace(3)* @llvm.amdgcn.kernel.k1.lds, i32 0, i32 1) to i16
; LOWER_LDS: store i16 %5, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 2
; LOWER_LDS: store i16 %5, i16 addrspace(3)* getelementptr inbounds (%llvm.amdgcn.module.lds.t, %llvm.amdgcn.module.lds.t addrspace(3)* @llvm.amdgcn.module.lds, i32 0, i32 1), align 16
; LOWER_LDS: br label %6
;
; LOWER_LDS-LABEL: 6:
@@ -177,11 +177,9 @@ define void @f2() {
; GCN: s_mov_b32 s32, 0
; GCN: s_and_saveexec_b64 s[0:1], vcc
; GCN: s_cbranch_execz BB2_2
; GCN: v_mov_b32_e32 v0, 24
; GCN: v_mov_b32_e32 v1, 0
; GCN: ds_write_b16 v1, v0 offset:18
; GCN: v_mov_b32_e32 v0, 32
; GCN: ds_write_b16 v1, v0 offset:16
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, 0x180020
; GCN: ds_write_b32 v0, v1 offset:16
; GCN-LABEL: BB2_2:
; GCN: s_or_b64 exec, exec, s[0:1]
; GCN: s_getpc_b64 s[0:1]