From 5517f57c4217ac21d376e4e2562f6afab4d37093 Mon Sep 17 00:00:00 2001 From: Vincent Lejeune Date: Tue, 9 Jul 2013 15:03:33 +0000 Subject: [PATCH] R600: Do not predicated basic block with multiple alu clause Test is not included as it is several 1000 lines long. To test this functionnality, a test case must generate at least 2 ALU clauses, where an ALU clause is ~110 instructions long. NOTE: This is a candidate for the stable branch. llvm-svn: 185943 --- lib/Target/R600/AMDGPUTargetMachine.cpp | 5 ++- lib/Target/R600/R600ControlFlowFinalizer.cpp | 2 + lib/Target/R600/R600EmitClauseMarkers.cpp | 14 ++++-- lib/Target/R600/R600InstrInfo.cpp | 45 ++++++++++++++++++++ lib/Target/R600/R600Instructions.td | 2 +- lib/Target/R600/R600Packetizer.cpp | 3 +- test/CodeGen/R600/jump-address.ll | 2 +- 7 files changed, 65 insertions(+), 8 deletions(-) diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 90f72de8ea7..7a14e50f86d 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -148,7 +148,11 @@ bool AMDGPUPassConfig::addPostRegAlloc() { } bool AMDGPUPassConfig::addPreSched2() { + const AMDGPUSubtarget &ST = TM->getSubtarget(); + if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { + addPass(createR600EmitClauseMarkers(*TM)); + } addPass(&IfConverterID); return false; } @@ -158,7 +162,6 @@ bool AMDGPUPassConfig::addPreEmitPass() { if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) { addPass(createAMDGPUCFGPreparationPass(*TM)); addPass(createAMDGPUCFGStructurizerPass(*TM)); - addPass(createR600EmitClauseMarkers(*TM)); addPass(createR600ExpandSpecialInstrsPass(*TM)); addPass(&FinalizeMachineBundlesID); addPass(createR600Packetizer(*TM)); diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index 887c808d5b3..932a6a7243c 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -256,6 +256,7 @@ private: ClauseContent.push_back(MILit); } } + assert(ClauseContent.size() < 128 && "ALU clause is too big"); ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1); return ClauseFile(ClauseHead, ClauseContent); } @@ -276,6 +277,7 @@ private: void EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause, unsigned &CfCount) { + Clause.first->getOperand(0).setImm(0); CounterPropagateAddr(Clause.first, CfCount); MachineBasicBlock *BB = Clause.first->getParent(); BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE)) diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp index 0aea2d7c030..c1da64cac48 100644 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -32,6 +32,7 @@ class R600EmitClauseMarkersPass : public MachineFunctionPass { private: static char ID; const R600InstrInfo *TII; + int Address; unsigned OccupiedDwords(MachineInstr *MI) const { switch (MI->getOpcode()) { @@ -159,7 +160,7 @@ private: } MachineBasicBlock::iterator - MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const { + MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) { MachineBasicBlock::iterator ClauseHead = I; std::vector > KCacheBanks; bool PushBeforeModifier = false; @@ -199,20 +200,25 @@ private: unsigned Opcode = PushBeforeModifier ? AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU; BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode)) - .addImm(0) // ADDR + // We don't use the ADDR field until R600ControlFlowFinalizer pass, where + // it is safe to assume it is 0. However if we always put 0 here, the ifcvt + // pass may assume that identical ALU clause starter at the beginning of a + // true and false branch can be factorized which is not the case. + .addImm(Address++) // ADDR .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0 .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1 .addImm(KCacheBanks.empty()?0:2) // KM0 .addImm((KCacheBanks.size() < 2)?0:2) // KM1 .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0 .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1 - .addImm(AluInstCount); // COUNT + .addImm(AluInstCount) // COUNT + .addImm(1); // Enabled return I; } public: R600EmitClauseMarkersPass(TargetMachine &tm) : MachineFunctionPass(ID), - TII(0) { } + TII(0), Address(0) { } virtual bool runOnMachineFunction(MachineFunction &MF) { TII = static_cast(MF.getTarget().getInstrInfo()); diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 969a7ce2b55..d0935fa3d28 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -651,6 +651,17 @@ int R600InstrInfo::getBranchInstr(const MachineOperand &op) const { }; } +static +MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) { + for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend(); + It != E; ++It) { + if (It->getOpcode() == AMDGPU::CF_ALU || + It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE) + return llvm::prior(It.base()); + } + return MBB.end(); +} + unsigned R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, @@ -672,6 +683,11 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND)) .addMBB(TBB) .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + return 1; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); + CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); return 1; } } else { @@ -683,6 +699,11 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB, .addMBB(TBB) .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + return 2; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU); + CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE)); return 2; } } @@ -706,6 +727,11 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); clearFlag(predSet, 0, MO_FLAG_PUSH); I->eraseFromParent(); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + break; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(AMDGPU::CF_ALU)); break; } case AMDGPU::JUMP: @@ -726,6 +752,11 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const { MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); clearFlag(predSet, 0, MO_FLAG_PUSH); I->eraseFromParent(); + MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB); + if (CfAlu == MBB.end()) + break; + assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE); + CfAlu->setDesc(get(AMDGPU::CF_ALU)); break; } case AMDGPU::JUMP: @@ -760,6 +791,15 @@ R600InstrInfo::isPredicable(MachineInstr *MI) const { if (MI->getOpcode() == AMDGPU::KILLGT) { return false; + } else if (MI->getOpcode() == AMDGPU::CF_ALU) { + // If the clause start in the middle of MBB then the MBB has more + // than a single clause, unable to predicate several clauses. + if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI)) + return false; + // TODO: We don't support KC merging atm + if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0) + return false; + return true; } else if (isVector(*MI)) { return false; } else { @@ -855,6 +895,11 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI, const SmallVectorImpl &Pred) const { int PIdx = MI->findFirstPredOperandIdx(); + if (MI->getOpcode() == AMDGPU::CF_ALU) { + MI->getOperand(8).setImm(0); + return true; + } + if (PIdx != -1) { MachineOperand &PMO = MI->getOperand(PIdx); PMO.setReg(Pred[2].getReg()); diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 735dcfc0256..df5c438d51a 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -563,7 +563,7 @@ class ALU_CLAUSE inst, string OpName> : AMDGPUInst <(outs), (ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1, KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1, i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1, -i32imm:$COUNT), +i32imm:$COUNT, i32imm:$Enabled), !strconcat(OpName, " $COUNT, @$ADDR, " "KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"), [] >, CF_ALU_WORD0, CF_ALU_WORD1 { diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp index 5ee51faea87..f4219bd476c 100644 --- a/lib/Target/R600/R600Packetizer.cpp +++ b/lib/Target/R600/R600Packetizer.cpp @@ -304,7 +304,8 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) { MachineBasicBlock::iterator End = MBB->end(); MachineBasicBlock::iterator MI = MBB->begin(); while (MI != End) { - if (MI->isKill()) { + if (MI->isKill() || + (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) { MachineBasicBlock::iterator DeleteMI = MI; ++MI; MBB->erase(DeleteMI); diff --git a/test/CodeGen/R600/jump-address.ll b/test/CodeGen/R600/jump-address.ll index ae9c8bba4fd..9a5f1bc3acb 100644 --- a/test/CodeGen/R600/jump-address.ll +++ b/test/CodeGen/R600/jump-address.ll @@ -1,6 +1,6 @@ ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s -; CHECK: JUMP @3 +; CHECK: JUMP @7 ; CHECK: EXPORT ; CHECK-NOT: EXPORT