
[AMDGPU] Fix promote alloca which is already vector

Simply do not touch loads and stores that are already vector.
Previously the pass was unable to see these loads and stores
because they were hidden behind bitcasts.

Differential Revision: https://reviews.llvm.org/D79738
Stanislav Mekhanoshin 2020-05-11 12:09:16 -07:00
parent 1acbe6fc41
commit 4a88942e02
2 changed files with 107 additions and 2 deletions
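To see the pattern being fixed, here is a minimal IR sketch (distilled from the new tests below): the alloca has one vector type, but the load and store are already vector operations of another vector type, reached only through a bitcast.

%alloca = alloca <4 x float>, align 16, addrspace(5)
%cast = bitcast <4 x float> addrspace(5)* %alloca to <4 x i32> addrspace(5)*
; the store and load below are already vector-typed; the pass must now
; leave them alone instead of rewriting them as element accesses
store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
%load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16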

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

@@ -468,7 +468,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      if (Inst->getType() == AllocaTy)
+      if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
         break;

       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
@@ -486,7 +486,8 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL) {
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(Inst);
-      if (SI->getValueOperand()->getType() == AllocaTy)
+      if (SI->getValueOperand()->getType() == AllocaTy ||
+          SI->getValueOperand()->getType()->isVectorTy())
         break;

       Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
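For context, a rough sketch of the promotion these guards now skip; this is illustrative IR, not code from the commit. For a scalar element access the pass loads the whole promoted vector and extracts one element, which is only correct when the original access really is a single element:

; before promotion: a scalar element load from the alloca
%gep = getelementptr inbounds [4 x float], [4 x float] addrspace(5)* %alloca, i32 0, i32 %idx
%elt = load float, float addrspace(5)* %gep
; after promotion: load the full vector and extract the indexed element;
; applied to an already-vector load this rewrite would be wrong, hence the break
%vecptr = bitcast [4 x float] addrspace(5)* %alloca to <4 x float> addrspace(5)*
%vec = load <4 x float>, <4 x float> addrspace(5)* %vecptr
%elt.promoted = extractelement <4 x float> %vec, i32 %idx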

llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll

@@ -345,6 +345,110 @@ entry:
ret void
}

; OPT-LABEL: @bitcast_vector_to_vector(
; OPT-NOT: alloca
; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
; GCN-LABEL: {{^}}bitcast_vector_to_vector:
; GCN: v_mov_b32_e32 v0, 1
; GCN: v_mov_b32_e32 v1, 2
; GCN: v_mov_b32_e32 v2, 3
; GCN: v_mov_b32_e32 v3, 4
; GCN: ScratchSize: 0
define amdgpu_kernel void @bitcast_vector_to_vector(<4 x i32> addrspace(1)* %out) {
.entry:
%alloca = alloca <4 x float>, align 16, addrspace(5)
%cast = bitcast <4 x float> addrspace(5)* %alloca to <4 x i32> addrspace(5)*
store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
%load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
store <4 x i32> %load, <4 x i32> addrspace(1)* %out
ret void
}

; OPT-LABEL: @vector_bitcast_from_alloca_array(
; OPT-NOT: alloca
; OPT: store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(1)* %out, align 16
; GCN-LABEL: {{^}}vector_bitcast_from_alloca_array:
; GCN: v_mov_b32_e32 v0, 1
; GCN: v_mov_b32_e32 v1, 2
; GCN: v_mov_b32_e32 v2, 3
; GCN: v_mov_b32_e32 v3, 4
; GCN: ScratchSize: 0
define amdgpu_kernel void @vector_bitcast_from_alloca_array(<4 x i32> addrspace(1)* %out) {
.entry:
%alloca = alloca [4 x float], align 16, addrspace(5)
%cast = bitcast [4 x float] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
store <4 x i32> <i32 1, i32 2, i32 3, i32 4>, <4 x i32> addrspace(5)* %cast
%load = load <4 x i32>, <4 x i32> addrspace(5)* %cast, align 16
store <4 x i32> %load, <4 x i32> addrspace(1)* %out
ret void
}

; OPT-LABEL: @vector_bitcast_to_array_from_alloca_array(
; OPT-NOT: alloca
; OPT: %out.repack = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 0
; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 1
; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 2
; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(1)* %out, i64 0, i64 3
; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
; GCN-LABEL: {{^}}vector_bitcast_to_array_from_alloca_array:
; GCN: v_mov_b32_e32 v0, 1
; GCN: v_mov_b32_e32 v1, 2
; GCN: v_mov_b32_e32 v2, 3
; GCN: v_mov_b32_e32 v3, 4
; GCN: ScratchSize: 0
define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array([4 x i32] addrspace(1)* %out) {
.entry:
%alloca = alloca [4 x float], align 16, addrspace(5)
%cast = bitcast [4 x float] addrspace(5)* %alloca to [4 x i32] addrspace(5)*
store [4 x i32] [i32 1, i32 2, i32 3, i32 4], [4 x i32] addrspace(5)* %cast
%load = load [4 x i32], [4 x i32] addrspace(5)* %cast, align 16
store [4 x i32] %load, [4 x i32] addrspace(1)* %out
ret void
}

; OPT-LABEL: @vector_bitcast_to_struct_from_alloca_array(
; OPT-NOT: alloca
; OPT: %out.repack = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 0
; OPT-NEXT: store i32 1, i32 addrspace(1)* %out.repack, align 4
; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 1
; OPT-NEXT: store i32 2, i32 addrspace(1)* %out.repack1, align 4
; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 2
; OPT-NEXT: store i32 3, i32 addrspace(1)* %out.repack2, align 4
; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, %struct.v4 addrspace(1)* %out, i64 0, i32 3
; OPT-NEXT: store i32 4, i32 addrspace(1)* %out.repack3, align 4
; GCN-LABEL: {{^}}vector_bitcast_to_struct_from_alloca_array:
; GCN: v_mov_b32_e32 v0, 1
; GCN: v_mov_b32_e32 v1, 2
; GCN: v_mov_b32_e32 v2, 3
; GCN: v_mov_b32_e32 v3, 4
; GCN: ScratchSize: 0
%struct.v4 = type { i32, i32, i32, i32 }
define amdgpu_kernel void @vector_bitcast_to_struct_from_alloca_array(%struct.v4 addrspace(1)* %out) {
.entry:
%alloca = alloca [4 x float], align 16, addrspace(5)
%cast = bitcast [4 x float] addrspace(5)* %alloca to %struct.v4 addrspace(5)*
store %struct.v4 { i32 1, i32 2, i32 3, i32 4 }, %struct.v4 addrspace(5)* %cast
%load = load %struct.v4, %struct.v4 addrspace(5)* %cast, align 16
store %struct.v4 %load, %struct.v4 addrspace(1)* %out
ret void
}

declare void @llvm.lifetime.start.p5i8(i64 immarg, i8 addrspace(5)* nocapture)
declare void @llvm.lifetime.end.p5i8(i64 immarg, i8 addrspace(5)* nocapture)