
[AMDGPU] Fix promote alloca which is already vector

Simply do not touch loads and stores that are already vector.
Previously the pass was unable to see these loads and stores
because they were hidden behind bitcasts.

Differential Revision: https://reviews.llvm.org/D79738
Stanislav Mekhanoshin 2020-05-11 12:09:16 -07:00
parent 1acbe6fc41
commit 4a88942e02
2 changed files with 107 additions and 2 deletions
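To see the pattern being fixed, here is a minimal IR sketch (distilled from the new tests below): the alloca has one vector type, but the load and store are already vector operations of another vector type, reached only through a bitcast.

%alloca = alloca <4 x float>, align 16, addrspace(5)
%cast = bitcast <4 x float> addrspace(5)* %alloca to <4 x i32> addrspace(5)*
; the store and load below are already vector-typed; the pass must now
; leave them alone instead of rewriting them as element accesses
store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
%load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16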

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

@@ -468,7 +468,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      if (Inst->getType() == AllocaTy)
+      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
         break;

       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
@@ -486,7 +486,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(Inst);
-      if (SI->getValueOperand()->getType() == AllocaTy)
+      if (SI->getValueOperand()->getType() == AllocaTy ||
+          SI->getValueOperand()->getType()->isVectorTy())
         break;

       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
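For context, a rough sketch of the promotion these guards now skip; this is illustrative IR, not code from the commit. For a scalar element access the pass loads the whole promoted vector and extracts one element, which is only correct when the original access really is a single element:

; before promotion: a scalar element load from the alloca
%gep = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %alloca, i32 0, i32 %idx
%elt = load float, float addrspace(5)* %gep
; after promotion: load the full vector and extract the indexed element;
; applied to an already-vector load this rewrite would be wrong, hence the break
%vecptr = bitcast [4 x float] addrspace(5)* %alloca to <4 x float> addrspace(5)*
%vec = load <4 x float>, <4 x float> addrspace(5)* %vecptr
%elt.promoted = extractelement <4 x float> %vec, i32 %idx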

llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll

@@ -345,6 +345,110 @@ entry:
ret void
}

; OPT-LABEL: @bitcast_vector_to_vector(
; OPT-NOT: alloca
; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
; GCN-LABEL: {{^}}bitcast_vector_to_vector:
; GCN: v_mov_b32_e32 v0, 1
; GCN: v_mov_b32_e32 v1, 2
; GCN: v_mov_b32_e32 v2, 3
; GCN: v_mov_b32_e32 v3, 4
; GCN: ScratchSize: 0
define amdgpu_kernel void @bitcast_vector_to_vector(<4 x i32> addrspace(1)* %out) {
.entry:
%alloca = alloca <4 x float>, align 16, addrspace(5)
%cast = bitcast <4 x float> addrspace(5)* %alloca to <4 x i32> addrspace(5)*
store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
%load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
store <4 x i32> %load, <4 x i32> addrspace(1)* %out
ret void
}

; OPT-LABEL: @vector_bitcast_from_alloca_array(
; OPT-NOT: alloca
; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
; GCN-LABEL: {{^}}vector_bitcast_from_alloca_array:
; GCN: v_mov_b32_e32 v0, 1
; GCN: v_mov_b32_e32 v1, 2
; GCN: v_mov_b32_e32 v2, 3
; GCN: v_mov_b32_e32 v3, 4
; GCN: ScratchSize: 0
define amdgpu_kernel void @vector_bitcast_from_alloca_array(<4 x i32> addrspace(1)* %out) {
.entry:
%alloca = alloca [4 x float], align 16, addrspace(5)
%cast = bitcast [4 x float] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
%load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
store <4 x i32> %load, <4 x i32> addrspace(1)* %out
ret void
}

; OPT-LABEL: @vector_bitcast_to_array_from_alloca_array(
; OPT-NOT: alloca
; OPT: %out.repack = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 0
; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 1
; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 2
; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 3
; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
; GCN-LABEL: {{^}}vector_bitcast_to_array_from_alloca_array:
; GCN: v_mov_b32_e32 v0, 1
; GCN: v_mov_b32_e32 v1, 2
; GCN: v_mov_b32_e32 v2, 3
; GCN: v_mov_b32_e32 v3, 4
; GCN: ScratchSize: 0
define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array([4 x i32] addrspace(1)* %out) {
.entry:
%alloca = alloca [4 x float], align 16, addrspace(5)
%cast = bitcast [4 x float] addrspace(5)* %alloca to [4 x i32] addrspace(5)*
store [4 x i32] [i32 1, i32 2, i32 3, i32 4], [4 x i32] addrspace(5)* %cast
%load = load [4 x i32], [4 x i32] addrspace(5)* %cast, align 16
store [4 x i32] %load, [4 x i32] addrspace(1)* %out
ret void
}

; OPT-LABEL: @vector_bitcast_to_struct_from_alloca_array(
; OPT-NOT: alloca
; OPT: %out.repack = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 0
; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 1
; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 2
; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 3
; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
; GCN-LABEL: {{^}}vector_bitcast_to_struct_from_alloca_array:
; GCN: v_mov_b32_e32 v0, 1
; GCN: v_mov_b32_e32 v1, 2
; GCN: v_mov_b32_e32 v2, 3
; GCN: v_mov_b32_e32 v3, 4
; GCN: ScratchSize: 0
%struct.v4 = type { i32, i32, i32, i32 }
define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(%struct.v4 addrspace(1)* %out) {
.entry:
%alloca = alloca [4 x float], align 16, addrspace(5)
%cast = bitcast [4 x float] addrspace(5)* %alloca to %struct.v4 addrspace(5)*
store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, %struct.v4 addrspace(5)* %cast
%load = load %struct.v4, %struct.v4 addrspace(5)* %cast, align 16
store %struct.v4 %load, %struct.v4 addrspace(1)* %out
ret void
}

declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture)