From 7e6b33626b5ddaf5c0ee4e67fe5f9ed55d297dd3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 30 May 2020 11:53:54 -0400 Subject: [PATCH] AMDGPU: Fix alignment for dynamic allocas The alignment value also needs to be scaled by the wave size. --- lib/Target/AMDGPU/SIISelLowering.cpp | 9 ++++++--- test/CodeGen/AMDGPU/non-entry-alloca.ll | 4 ++-- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index bbd3737d2ef..08effeea181 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -3126,9 +3126,12 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl( unsigned StackAlign = TFL->getStackAlignment(); Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value - if (Align > StackAlign) - Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, - DAG.getConstant(-(uint64_t)Align, dl, VT)); + if (Align > StackAlign) { + Tmp1 = DAG.getNode( + ISD::AND, dl, VT, Tmp1, + DAG.getConstant(-(uint64_t)Align << ST.getWavefrontSizeLog2(), dl, VT)); + } + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain Tmp2 = DAG.getCALLSEQ_END( Chain, DAG.getIntPtrConstant(0, dl, true), diff --git a/test/CodeGen/AMDGPU/non-entry-alloca.ll b/test/CodeGen/AMDGPU/non-entry-alloca.ll index 060d66ae842..0cd60bd8203 100644 --- a/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -95,7 +95,7 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache ; GCN-NEXT: s_cbranch_scc1 BB1_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: s_andn2_b32 s6, s6, 63 +; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 ; GCN-NEXT: s_lshl_b32 s7, s7, 2 ; GCN-NEXT: s_mov_b32 s32, s6 ; GCN-NEXT: v_mov_b32_e32 v2, s6 @@ -223,7 +223,7 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out, ; GCN-NEXT: s_cbranch_execz BB3_2 ; GCN-NEXT: ; %bb.1: ; %bb.0 ; GCN-NEXT: s_add_i32 s6, s32, 0x1000 -; GCN-NEXT: s_andn2_b32 s6, s6, 63 +; GCN-NEXT: s_and_b32 s6, s6, 0xfffff000 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v5, s6 ; GCN-NEXT: v_mov_b32_e32 v6, 1