mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-25 20:23:11 +01:00
[AMDGPU] SI_INDIRECT_DST_V* pseudos expansion should place EXEC restore to separate basic block
Summary: When SI_INDIRECT_DST_V* pseudos has indexes in VGPR, they get expanded into the self-looped basic block that modifies EXEC in a loop. To keep EXEC consistent it is stored before and then re-stored after the pseudo expansion result. %95:vreg_512 = SI_INDIRECT_DST_V16 %93:vreg_512(tied-def 0), %94:sreg_32, 0, killed %1500:vgpr_32 results to s_mov_b64 s[6:7], exec BB0_16: v_readfirstlane_b32 s8, v28 v_cmp_eq_u32_e32 vcc, s8, v28 s_and_saveexec_b64 vcc, vcc s_set_gpr_idx_on s8, gpr_idx(DST) v_mov_b32_e32 v6, v25 s_set_gpr_idx_off s_xor_b64 exec, exec, vcc s_cbranch_execnz BB0_16 ; %bb.17: s_mov_b64 exec, s[6:7] The bug appeared in case this expansion occurs in the ELSE block of the CF. Originally %110:vreg_512 = SI_INDIRECT_DST_V16 %103:vreg_512(tied-def 0), %85:vgpr_32, 0, %107:vgpr_32, %112:sreg_64 = SI_ELSE %108:sreg_64, %bb.19, 0, implicit-def dead $exec, implicit-def dead $scc, implicit $exec expanded to ****************** <== here exec has "THEN" context s_mov_b64 s[6:7], exec BB0_16: v_readfirstlane_b32 s8, v28 v_cmp_eq_u32_e32 vcc, s8, v28 s_and_saveexec_b64 vcc, vcc s_set_gpr_idx_on s8, gpr_idx(DST) v_mov_b32_e32 v6, v25 s_set_gpr_idx_off s_xor_b64 exec, exec, vcc s_cbranch_execnz BB0_16 ; %bb.17: s_or_saveexec_b64 s[4:5], s[4:5] <-- exec mask is restored for "ELSE" but immediately overwritten. s_mov_b64 exec, s[6:7] The rest of the "ELSE" block is executed not by the workitems which constitute the "else mask" but by those which constitute "then mask" SILowerControlFlow::emitElse always considers the basic block begin() as an insertion point for s_or_saveexec. Proposed fix: The SI_INDIRECT_DST_V* procedure should split the reminder block to create landing pad for the EXEC restoration. Reviewers: rampitec, vpykhtin, nhaehnle Reviewed By: vpykhtin Subscribers: arsenm, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, kerbowa, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D75472
This commit is contained in:
parent
cdd0eae6fc
commit
01c427b18c
@ -3293,9 +3293,15 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
|
|||||||
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
|
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
|
||||||
InitResultReg, DstReg, PhiReg, TmpExec,
|
InitResultReg, DstReg, PhiReg, TmpExec,
|
||||||
Offset, UseGPRIdxMode, IsIndirectSrc);
|
Offset, UseGPRIdxMode, IsIndirectSrc);
|
||||||
|
MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
|
||||||
MachineBasicBlock::iterator First = RemainderBB->begin();
|
MachineFunction::iterator MBBI(LoopBB);
|
||||||
BuildMI(*RemainderBB, First, DL, TII->get(MovExecOpc), Exec)
|
++MBBI;
|
||||||
|
MF->insert(MBBI, LandingPad);
|
||||||
|
LoopBB->removeSuccessor(RemainderBB);
|
||||||
|
LandingPad->addSuccessor(RemainderBB);
|
||||||
|
LoopBB->addSuccessor(LandingPad);
|
||||||
|
MachineBasicBlock::iterator First = LandingPad->begin();
|
||||||
|
BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
|
||||||
.addReg(SaveExec);
|
.addReg(SaveExec);
|
||||||
|
|
||||||
return InsPt;
|
return InsPt;
|
||||||
|
@ -79,7 +79,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
|
|||||||
; GCN: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
|
; GCN: SI_SPILL_V32_SAVE killed $vgpr1, %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.4, addrspace 5)
|
||||||
; GCN: SI_SPILL_S64_SAVE killed $sgpr24_sgpr25, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.5, align 4, addrspace 5)
|
; GCN: SI_SPILL_S64_SAVE killed $sgpr24_sgpr25, %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (store 8 into %stack.5, align 4, addrspace 5)
|
||||||
; GCN: bb.1:
|
; GCN: bb.1:
|
||||||
; GCN: successors: %bb.1(0x40000000), %bb.2(0x40000000)
|
; GCN: successors: %bb.1(0x40000000), %bb.3(0x40000000)
|
||||||
; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 8 from %stack.5, align 4, addrspace 5)
|
; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.5, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 8 from %stack.5, align 4, addrspace 5)
|
||||||
; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
|
; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.4, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.4, addrspace 5)
|
||||||
; GCN: $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
|
; GCN: $vgpr1 = SI_SPILL_V32_RESTORE %stack.0, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
|
||||||
@ -99,9 +99,11 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
|
|||||||
; GCN: SI_SPILL_V32_SAVE killed $vgpr18, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.8, addrspace 5)
|
; GCN: SI_SPILL_V32_SAVE killed $vgpr18, %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (store 4 into %stack.8, addrspace 5)
|
||||||
; GCN: $exec = S_XOR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
|
; GCN: $exec = S_XOR_B64_term $exec, killed renamable $sgpr4_sgpr5, implicit-def $scc
|
||||||
; GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec
|
; GCN: S_CBRANCH_EXECNZ %bb.1, implicit $exec
|
||||||
; GCN: bb.2:
|
; GCN: bb.3:
|
||||||
|
; GCN: successors: %bb.2(0x80000000)
|
||||||
; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 8 from %stack.3, align 4, addrspace 5)
|
; GCN: $sgpr0_sgpr1 = SI_SPILL_S64_RESTORE %stack.3, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 8 from %stack.3, align 4, addrspace 5)
|
||||||
; GCN: $exec = S_MOV_B64 renamable $sgpr0_sgpr1
|
; GCN: $exec = S_MOV_B64 killed renamable $sgpr0_sgpr1
|
||||||
|
; GCN: bb.2:
|
||||||
; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5)
|
; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5)
|
||||||
; GCN: $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 16 from %stack.1, align 4, addrspace 5)
|
; GCN: $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3 :: (load 16 from %stack.1, align 4, addrspace 5)
|
||||||
; GCN: BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
|
; GCN: BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
|
||||||
|
Loading…
Reference in New Issue
Block a user