From ae477cc7747acbb80b543e25dcbb9dc6ec7bf42f Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Wed, 22 Jan 2014 21:55:43 +0000 Subject: [PATCH] R600: Refactor stack size calculation reviewed-by: Vincent Lejeune llvm-svn: 199840 --- lib/Target/R600/AMDGPUSubtarget.cpp | 17 ++ lib/Target/R600/AMDGPUSubtarget.h | 1 + lib/Target/R600/R600ControlFlowFinalizer.cpp | 175 +++++++++++++++---- test/CodeGen/R600/elf.r600.ll | 2 +- 4 files changed, 158 insertions(+), 37 deletions(-) diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp index ed7742cdc2a..f36aa2071c7 100644 --- a/lib/Target/R600/AMDGPUSubtarget.cpp +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -79,6 +79,23 @@ unsigned AMDGPUSubtarget::getWavefrontSize() const { return WavefrontSize; } +unsigned +AMDGPUSubtarget::getStackEntrySize() const { + assert(getGeneration() <= NORTHERN_ISLANDS); + switch(getWavefrontSize()) { + case 16: + return 8; + case 32: + if (hasCaymanISA()) + return 4; + else + return 8; + case 64: + return 4; + default: + llvm_unreachable("Illegal wavefront size."); + } +} bool AMDGPUSubtarget::isTargetELF() const { return false; diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h index 53c7d2531a0..68d853218ba 100644 --- a/lib/Target/R600/AMDGPUSubtarget.h +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -70,6 +70,7 @@ public: bool IsIRStructurizerEnabled() const; bool isIfCvtEnabled() const; unsigned getWavefrontSize() const; + unsigned getStackEntrySize() const; virtual bool enableMachineScheduler() const { return getGeneration() <= NORTHERN_ISLANDS; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index 56dc90eaa11..6b42a7a9faf 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -28,6 +28,134 @@ using namespace llvm; namespace { +struct CFStack { + + enum StackItem { + ENTRY = 0, + SUB_ENTRY = 1, + FIRST_NON_WQM_PUSH = 2, 
+ FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3 + }; + + const AMDGPUSubtarget &ST; + std::vector BranchStack; + std::vector LoopStack; + unsigned MaxStackSize; + unsigned CurrentEntries; + unsigned CurrentSubEntries; + + CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st), + // We need to reserve a stack entry for CALL_FS in vertex shaders. + MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0), + CurrentEntries(0), CurrentSubEntries(0) { } + + unsigned getLoopDepth(); + bool branchStackContains(CFStack::StackItem); + bool requiresWorkAroundForInst(unsigned Opcode); + unsigned getSubEntrySize(CFStack::StackItem Item); + void updateMaxStackSize(); + void pushBranch(unsigned Opcode, bool isWQM = false); + void pushLoop(); + void popBranch(); + void popLoop(); +}; + +unsigned CFStack::getLoopDepth() { + return LoopStack.size(); +} + +bool CFStack::branchStackContains(CFStack::StackItem Item) { + for (std::vector::const_iterator I = BranchStack.begin(), + E = BranchStack.end(); I != E; ++I) { + if (*I == Item) + return true; + } + return false; +} + +unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) { + switch(Item) { + default: + return 0; + case CFStack::FIRST_NON_WQM_PUSH: + assert(!ST.hasCaymanISA()); + if (ST.getGeneration() <= AMDGPUSubtarget::R700) { + // +1 For the push operation. + // +2 Extra space required. + return 3; + } else { + // Some documentation says that this is not necessary on Evergreen, + // but experimentation has shown that we need to allocate 1 extra + // sub-entry for the first non-WQM push. + // +1 For the push operation. + // +1 Extra space required. + return 2; + } + case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY: + assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN); + // +1 For the push operation. + // +1 Extra space required. 
+ return 2; + case CFStack::SUB_ENTRY: + return 1; + } +} + +void CFStack::updateMaxStackSize() { + unsigned CurrentStackSize = CurrentEntries + + (RoundUpToAlignment(CurrentSubEntries, 4) / 4); + MaxStackSize = std::max(CurrentStackSize, MaxStackSize); +} + +void CFStack::pushBranch(unsigned Opcode, bool isWQM) { + CFStack::StackItem Item = CFStack::ENTRY; + switch(Opcode) { + case AMDGPU::CF_PUSH_EG: + case AMDGPU::CF_ALU_PUSH_BEFORE: + if (!isWQM) { + if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH)) + Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI + // See comment in + // CFStack::getSubEntrySize() + else if (CurrentEntries > 0 && + ST.getGeneration() > AMDGPUSubtarget::EVERGREEN && + !ST.hasCaymanISA() && + !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY)) + Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY; + else + Item = CFStack::SUB_ENTRY; + } else + Item = CFStack::ENTRY; + break; + } + BranchStack.push_back(Item); + if (Item == CFStack::ENTRY) + CurrentEntries++; + else + CurrentSubEntries += getSubEntrySize(Item); + updateMaxStackSize(); +} + +void CFStack::pushLoop() { + LoopStack.push_back(CFStack::ENTRY); + CurrentEntries++; + updateMaxStackSize(); +} + +void CFStack::popBranch() { + CFStack::StackItem Top = BranchStack.back(); + if (Top == CFStack::ENTRY) + CurrentEntries--; + else + CurrentSubEntries-= getSubEntrySize(Top); + BranchStack.pop_back(); +} + +void CFStack::popLoop() { + CurrentEntries--; + LoopStack.pop_back(); +} + class R600ControlFlowFinalizer : public MachineFunctionPass { private: @@ -300,24 +428,6 @@ private: } } - unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const { - switch (ST.getGeneration()) { - case AMDGPUSubtarget::R600: - case AMDGPUSubtarget::R700: - if (hasPush) - StackSubEntry += 2; - break; - case AMDGPUSubtarget::EVERGREEN: - if (hasPush) - StackSubEntry ++; - case AMDGPUSubtarget::NORTHERN_ISLANDS: - StackSubEntry += 2; - break; - 
default: llvm_unreachable("Not a VLIW4/VLIW5 GPU"); - } - return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4 - } - public: R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID), TII (0), TRI(0), @@ -329,23 +439,19 @@ public: virtual bool runOnMachineFunction(MachineFunction &MF) { TII=static_cast(MF.getTarget().getInstrInfo()); TRI=static_cast(MF.getTarget().getRegisterInfo()); + R600MachineFunctionInfo *MFI = MF.getInfo(); - unsigned MaxStack = 0; - unsigned CurrentStack = 0; - unsigned CurrentLoopDepth = 0; - bool HasPush = false; + CFStack CFStack(ST, MFI->ShaderType); for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; ++MB) { MachineBasicBlock &MBB = *MB; unsigned CfCount = 0; std::vector > > LoopStack; std::vector IfThenElseStack; - R600MachineFunctionInfo *MFI = MF.getInfo(); if (MFI->ShaderType == 1) { BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), getHWInstrDesc(CF_CALL_FS)); CfCount++; - MaxStack = 1; } std::vector FetchClauses, AluClauses; std::vector LastAlu(1); @@ -368,16 +474,16 @@ public: I++; switch (MI->getOpcode()) { case AMDGPU::CF_ALU_PUSH_BEFORE: - CurrentStack++; - MaxStack = std::max(MaxStack, CurrentStack); - HasPush = true; - if (ST.hasCaymanISA() && CurrentLoopDepth > 1) { + if (ST.hasCaymanISA() && CFStack.getLoopDepth() > 1) { BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG)) .addImm(CfCount + 1) .addImm(1); MI->setDesc(TII->get(AMDGPU::CF_ALU)); CfCount++; - } + CFStack.pushBranch(AMDGPU::CF_PUSH_EG); + } else + CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); + case AMDGPU::CF_ALU: I = MI; AluClauses.push_back(MakeALUClause(MBB, I)); @@ -385,9 +491,7 @@ public: CfCount++; break; case AMDGPU::WHILELOOP: { - CurrentStack+=4; - CurrentLoopDepth++; - MaxStack = std::max(MaxStack, CurrentStack); + CFStack.pushLoop(); MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI), getHWInstrDesc(CF_WHILE_LOOP)) .addImm(1); @@ -400,8 +504,7 @@ public: break; } case 
AMDGPU::ENDLOOP: { - CurrentStack-=4; - CurrentLoopDepth--; + CFStack.popLoop(); std::pair > Pair = LoopStack.back(); LoopStack.pop_back(); @@ -439,7 +542,7 @@ public: break; } case AMDGPU::ENDIF: { - CurrentStack--; + CFStack.popBranch(); if (LastAlu.back()) { ToPopAfter.push_back(LastAlu.back()); } else { @@ -514,7 +617,7 @@ public: .addImm(Alu->getOperand(8).getImm()); Alu->eraseFromParent(); } - MFI->StackSize = getHWStackSize(MaxStack, HasPush); + MFI->StackSize = CFStack.MaxStackSize; } return false; diff --git a/test/CodeGen/R600/elf.r600.ll b/test/CodeGen/R600/elf.r600.ll index 0590efb0915..4436c07c5a7 100644 --- a/test/CodeGen/R600/elf.r600.ll +++ b/test/CodeGen/R600/elf.r600.ll @@ -6,7 +6,7 @@ ; CONFIG-CHECK: .section .AMDGPU.config ; CONFIG-CHECK-NEXT: .long 166100 -; CONFIG-CHECK-NEXT: .long 258 +; CONFIG-CHECK-NEXT: .long 2 ; CONFIG-CHECK-NEXT: .long 165900 ; CONFIG-CHECK-NEXT: .long 0 define void @test(float addrspace(1)* %out, i32 %p) {