mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 19:23:23 +01:00
AMDGPU/SI: Improve SILoadStoreOptimizer and run it before the scheduler
Summary: The SILoadStoreOptimizer can now look ahead more than one instruction when looking for instructions to merge, which greatly improves the number of loads/stores that we are able to merge. Moving the pass before scheduling avoids increasing register pressure after the scheduler, so that the scheduler's register pressure estimates will be more accurate. It also gives more consistent results, since it is no longer affected by minor scheduling changes. Reviewers: arsenm Subscribers: arsenm, kzhuravl, llvm-commits Differential Revision: https://reviews.llvm.org/D23814 llvm-svn: 279991
This commit is contained in:
parent
2fbd6a61b4
commit
261d5c31cf
@ -497,6 +497,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
|
||||
// XXX - Can we get away without running DeadMachineInstructionElim again?
|
||||
addPass(&SIFoldOperandsID);
|
||||
addPass(&DeadMachineInstructionElimID);
|
||||
addPass(&SILoadStoreOptimizerID);
|
||||
}
|
||||
|
||||
void GCNPassConfig::addIRPasses() {
|
||||
@ -533,17 +534,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
|
||||
#endif
|
||||
|
||||
void GCNPassConfig::addPreRegAlloc() {
|
||||
if (getOptLevel() > CodeGenOpt::None) {
|
||||
// Don't do this with no optimizations since it throws away debug info by
|
||||
// merging nonadjacent loads.
|
||||
|
||||
// This should be run after scheduling, but before register allocation. It
|
||||
// also need extra copies to the address operand to be eliminated.
|
||||
|
||||
// FIXME: Move pre-RA and remove extra reg coalescer run.
|
||||
insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
|
||||
insertPass(&MachineSchedulerID, &RegisterCoalescerID);
|
||||
}
|
||||
|
||||
addPass(createSIShrinkInstructionsPass());
|
||||
addPass(createSIWholeQuadModePass());
|
||||
|
@ -60,31 +60,35 @@ private:
|
||||
const SIInstrInfo *TII;
|
||||
const SIRegisterInfo *TRI;
|
||||
MachineRegisterInfo *MRI;
|
||||
LiveIntervals *LIS;
|
||||
AliasAnalysis *AA;
|
||||
|
||||
static bool offsetsCanBeCombined(unsigned Offset0,
|
||||
unsigned Offset1,
|
||||
unsigned EltSize);
|
||||
|
||||
MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
|
||||
unsigned EltSize);
|
||||
MachineBasicBlock::iterator findMatchingDSInst(
|
||||
MachineBasicBlock::iterator I,
|
||||
unsigned EltSize,
|
||||
SmallVectorImpl<MachineInstr*> &InstsToMove);
|
||||
|
||||
MachineBasicBlock::iterator mergeRead2Pair(
|
||||
MachineBasicBlock::iterator I,
|
||||
MachineBasicBlock::iterator Paired,
|
||||
unsigned EltSize);
|
||||
unsigned EltSize,
|
||||
ArrayRef<MachineInstr*> InstsToMove);
|
||||
|
||||
MachineBasicBlock::iterator mergeWrite2Pair(
|
||||
MachineBasicBlock::iterator I,
|
||||
MachineBasicBlock::iterator Paired,
|
||||
unsigned EltSize);
|
||||
unsigned EltSize,
|
||||
ArrayRef<MachineInstr*> InstsToMove);
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
SILoadStoreOptimizer()
|
||||
: MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
|
||||
LIS(nullptr) {}
|
||||
AA(nullptr) {}
|
||||
|
||||
SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
|
||||
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
|
||||
@ -98,17 +102,9 @@ public:
|
||||
return "SI Load / Store Optimizer";
|
||||
}
|
||||
|
||||
MachineFunctionProperties getRequiredProperties() const override {
|
||||
return MachineFunctionProperties().set(
|
||||
MachineFunctionProperties::Property::NoPHIs);
|
||||
}
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.setPreservesCFG();
|
||||
AU.addPreserved<SlotIndexes>();
|
||||
AU.addPreserved<LiveIntervals>();
|
||||
AU.addPreserved<LiveVariables>();
|
||||
AU.addRequired<LiveIntervals>();
|
||||
AU.addRequired<AAResultsWrapperPass>();
|
||||
|
||||
MachineFunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
@ -118,9 +114,7 @@ public:
|
||||
|
||||
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
|
||||
"SI Load / Store Optimizer", false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
|
||||
INITIALIZE_PASS_DEPENDENCY(LiveVariables)
|
||||
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
|
||||
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
|
||||
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
|
||||
"SI Load / Store Optimizer", false, false)
|
||||
|
||||
@ -132,6 +126,40 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
|
||||
return new SILoadStoreOptimizer(TM);
|
||||
}
|
||||
|
||||
/// Relocate every instruction in \p InstsToMove so that it ends up after \p I.
///
/// Each instruction is unlinked from its current position and re-inserted in
/// front of the instruction that followed \p I on entry, so the moved
/// instructions keep the relative order they have in \p InstsToMove.
///
/// \param I            Instruction after which the moved instructions are
///                     placed; it is dereferenced, so it must not be the end
///                     iterator.
/// \param InstsToMove  Instructions to relocate, in the desired final order.
static void moveInstsAfter(MachineBasicBlock::iterator I,
                           ArrayRef<MachineInstr*> InstsToMove) {
  MachineBasicBlock *Block = I->getParent();

  // Everything is inserted in front of the successor of I.
  MachineBasicBlock::iterator InsertPt = I;
  ++InsertPt;

  for (MachineInstr *Moved : InstsToMove) {
    Moved->removeFromParent();
    Block->insert(InsertPt, Moved);
  }
}
|
||||
|
||||
static void addDefsToList(const MachineInstr &MI,
|
||||
SmallVectorImpl<const MachineOperand *> &Defs) {
|
||||
for (const MachineOperand &Def : MI.defs()) {
|
||||
Defs.push_back(&Def);
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
|
||||
ArrayRef<MachineInstr*> InstsToMove,
|
||||
const SIInstrInfo *TII,
|
||||
AliasAnalysis *AA) {
|
||||
|
||||
assert(MemOp.mayLoadOrStore());
|
||||
|
||||
for (MachineInstr *InstToMove : InstsToMove) {
|
||||
if (!InstToMove->mayLoadOrStore())
|
||||
continue;
|
||||
if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
|
||||
unsigned Offset1,
|
||||
unsigned Size) {
|
||||
@ -161,44 +189,98 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
|
||||
|
||||
MachineBasicBlock::iterator
|
||||
SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
|
||||
unsigned EltSize){
|
||||
unsigned EltSize,
|
||||
SmallVectorImpl<MachineInstr*> &InstsToMove) {
|
||||
MachineBasicBlock::iterator E = I->getParent()->end();
|
||||
MachineBasicBlock &MBB = *I->getParent();
|
||||
MachineBasicBlock::iterator MBBI = I;
|
||||
++MBBI;
|
||||
|
||||
if (MBBI == MBB.end() || MBBI->getOpcode() != I->getOpcode())
|
||||
return E;
|
||||
SmallVector<const MachineOperand *, 8> DefsToMove;
|
||||
addDefsToList(*I, DefsToMove);
|
||||
|
||||
// Don't merge volatiles.
|
||||
if (MBBI->hasOrderedMemoryRef())
|
||||
return E;
|
||||
for ( ; MBBI != E; ++MBBI) {
|
||||
|
||||
int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
|
||||
const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
|
||||
const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
|
||||
if (MBBI->getOpcode() != I->getOpcode()) {
|
||||
|
||||
// Check same base pointer. Be careful of subregisters, which can occur with
|
||||
// vectors of pointers.
|
||||
if (AddrReg0.getReg() == AddrReg1.getReg() &&
|
||||
AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
|
||||
int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
|
||||
AMDGPU::OpName::offset);
|
||||
unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
|
||||
unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
|
||||
// This is not a matching DS instruction, but we can keep looking as
|
||||
// long as one of these conditions is met:
|
||||
// 1. It is safe to move I down past MBBI.
|
||||
// 2. It is safe to move MBBI down past the instruction that I will
|
||||
// be merged into.
|
||||
|
||||
// Check both offsets fit in the reduced range.
|
||||
if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
|
||||
return MBBI;
|
||||
if (MBBI->hasUnmodeledSideEffects())
|
||||
// We can't re-order this instruction with respect to other memory
|
||||
// operations, so we fail both conditions mentioned above.
|
||||
return E;
|
||||
|
||||
if (MBBI->mayLoadOrStore() &&
|
||||
!TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
|
||||
// We fail condition #1, but we may still be able to satisfy condition
|
||||
// #2. Add this instruction to the move list and then we will check
|
||||
// if condition #2 holds once we have selected the matching instruction.
|
||||
InstsToMove.push_back(&*MBBI);
|
||||
addDefsToList(*MBBI, DefsToMove);
|
||||
continue;
|
||||
}
|
||||
|
||||
// When we match I with another DS instruction we will be moving I down
|
||||
// to the location of the matched instruction, so any uses of I will need to
|
||||
// be moved down as well.
|
||||
for (const MachineOperand *Def : DefsToMove) {
|
||||
bool ReadDef = MBBI->readsVirtualRegister(Def->getReg());
|
||||
// If ReadDef is true, then there is a use of Def between I
|
||||
// and the instruction that I will potentially be merged with. We
|
||||
// will need to move this instruction after the merged instructions.
|
||||
if (ReadDef) {
|
||||
InstsToMove.push_back(&*MBBI);
|
||||
addDefsToList(*MBBI, DefsToMove);
|
||||
break;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Don't merge volatiles.
|
||||
if (MBBI->hasOrderedMemoryRef())
|
||||
return E;
|
||||
|
||||
int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
|
||||
const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
|
||||
const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
|
||||
|
||||
// Check same base pointer. Be careful of subregisters, which can occur with
|
||||
// vectors of pointers.
|
||||
if (AddrReg0.getReg() == AddrReg1.getReg() &&
|
||||
AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
|
||||
int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
|
||||
AMDGPU::OpName::offset);
|
||||
unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
|
||||
unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
|
||||
|
||||
// Check both offsets fit in the reduced range.
|
||||
// We also need to go through the list of instructions that we plan to
|
||||
// move and make sure they are all safe to move down past the merged
|
||||
// instruction.
|
||||
if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
|
||||
canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
|
||||
return MBBI;
|
||||
}
|
||||
|
||||
// We've found a load/store that we couldn't merge for some reason.
|
||||
// We could potentially keep looking, but we'd need to make sure that
|
||||
// it was safe to move I and also all the instruction in InstsToMove
|
||||
// down past this instruction.
|
||||
// FIXME: This is too conservative.
|
||||
break;
|
||||
}
|
||||
|
||||
return E;
|
||||
}
|
||||
|
||||
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
|
||||
MachineBasicBlock::iterator I,
|
||||
MachineBasicBlock::iterator Paired,
|
||||
unsigned EltSize) {
|
||||
unsigned EltSize,
|
||||
ArrayRef<MachineInstr*> InstsToMove) {
|
||||
MachineBasicBlock *MBB = I->getParent();
|
||||
|
||||
// Be careful, since the addresses could be subregisters themselves in weird
|
||||
@ -247,7 +329,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
|
||||
|
||||
DebugLoc DL = I->getDebugLoc();
|
||||
MachineInstrBuilder Read2
|
||||
= BuildMI(*MBB, I, DL, Read2Desc, DestReg)
|
||||
= BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
|
||||
.addOperand(*AddrReg) // addr
|
||||
.addImm(NewOffset0) // offset0
|
||||
.addImm(NewOffset1) // offset1
|
||||
@ -258,48 +340,28 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
|
||||
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
|
||||
|
||||
// Copy to the old destination registers.
|
||||
MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
|
||||
BuildMI(*MBB, Paired, DL, CopyDesc)
|
||||
.addOperand(*Dest0) // Copy to same destination including flags and sub reg.
|
||||
.addReg(DestReg, 0, SubRegIdx0);
|
||||
MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
|
||||
MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
|
||||
.addOperand(*Dest1)
|
||||
.addReg(DestReg, RegState::Kill, SubRegIdx1);
|
||||
|
||||
LIS->InsertMachineInstrInMaps(*Read2);
|
||||
|
||||
// repairLiveintervalsInRange() doesn't handle physical register, so we have
|
||||
// to update the M0 range manually.
|
||||
SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
|
||||
LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
|
||||
LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
|
||||
bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
|
||||
|
||||
// The new write to the original destination register is now the copy. Steal
|
||||
// the old SlotIndex.
|
||||
LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
|
||||
LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);
|
||||
moveInstsAfter(Copy1, InstsToMove);
|
||||
|
||||
MachineBasicBlock::iterator Next = std::next(I);
|
||||
I->eraseFromParent();
|
||||
Paired->eraseFromParent();
|
||||
|
||||
LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
|
||||
LIS->shrinkToUses(&AddrRegLI);
|
||||
|
||||
LIS->createAndComputeVirtRegInterval(DestReg);
|
||||
|
||||
if (UpdateM0Range) {
|
||||
SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
|
||||
M0Segment->end = Read2Index.getRegSlot();
|
||||
}
|
||||
|
||||
DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
|
||||
return Read2.getInstr();
|
||||
return Next;
|
||||
}
|
||||
|
||||
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
|
||||
MachineBasicBlock::iterator I,
|
||||
MachineBasicBlock::iterator Paired,
|
||||
unsigned EltSize) {
|
||||
unsigned EltSize,
|
||||
ArrayRef<MachineInstr*> InstsToMove) {
|
||||
MachineBasicBlock *MBB = I->getParent();
|
||||
|
||||
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
|
||||
@ -341,15 +403,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
|
||||
const MCInstrDesc &Write2Desc = TII->get(Opc);
|
||||
DebugLoc DL = I->getDebugLoc();
|
||||
|
||||
// repairLiveintervalsInRange() doesn't handle physical register, so we have
|
||||
// to update the M0 range manually.
|
||||
SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
|
||||
LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
|
||||
LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
|
||||
bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
|
||||
|
||||
MachineInstrBuilder Write2
|
||||
= BuildMI(*MBB, I, DL, Write2Desc)
|
||||
= BuildMI(*MBB, Paired, DL, Write2Desc)
|
||||
.addOperand(*Addr) // addr
|
||||
.addOperand(*Data0) // data0
|
||||
.addOperand(*Data1) // data1
|
||||
@ -359,24 +414,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
|
||||
.addMemOperand(*I->memoperands_begin())
|
||||
.addMemOperand(*Paired->memoperands_begin());
|
||||
|
||||
// XXX - How do we express subregisters here?
|
||||
unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
|
||||
moveInstsAfter(Write2, InstsToMove);
|
||||
|
||||
LIS->RemoveMachineInstrFromMaps(*I);
|
||||
LIS->RemoveMachineInstrFromMaps(*Paired);
|
||||
MachineBasicBlock::iterator Next = std::next(I);
|
||||
I->eraseFromParent();
|
||||
Paired->eraseFromParent();
|
||||
|
||||
// This doesn't handle physical registers like M0
|
||||
LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
|
||||
|
||||
if (UpdateM0Range) {
|
||||
SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
|
||||
M0Segment->end = Write2Index.getRegSlot();
|
||||
}
|
||||
|
||||
DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
|
||||
return Write2.getInstr();
|
||||
return Next;
|
||||
}
|
||||
|
||||
// Scan through looking for adjacent LDS operations with constant offsets from
|
||||
@ -394,13 +439,15 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
|
||||
continue;
|
||||
}
|
||||
|
||||
SmallVector<MachineInstr*, 8> InstsToMove;
|
||||
unsigned Opc = MI.getOpcode();
|
||||
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
|
||||
unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
|
||||
MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
|
||||
MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
|
||||
InstsToMove);
|
||||
if (Match != E) {
|
||||
Modified = true;
|
||||
I = mergeRead2Pair(I, Match, Size);
|
||||
I = mergeRead2Pair(I, Match, Size, InstsToMove);
|
||||
} else {
|
||||
++I;
|
||||
}
|
||||
@ -408,10 +455,11 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
|
||||
continue;
|
||||
} else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
|
||||
unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
|
||||
MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
|
||||
MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
|
||||
InstsToMove);
|
||||
if (Match != E) {
|
||||
Modified = true;
|
||||
I = mergeWrite2Pair(I, Match, Size);
|
||||
I = mergeWrite2Pair(I, Match, Size, InstsToMove);
|
||||
} else {
|
||||
++I;
|
||||
}
|
||||
@ -437,8 +485,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
|
||||
TRI = &TII->getRegisterInfo();
|
||||
|
||||
MRI = &MF.getRegInfo();
|
||||
|
||||
LIS = &getAnalysis<LiveIntervals>();
|
||||
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
|
||||
|
||||
DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
|
||||
|
||||
|
@ -9,8 +9,8 @@
|
||||
; SI-LABEL: {{^}}offset_order:
|
||||
|
||||
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
|
||||
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
|
||||
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
|
||||
; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
|
||||
; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
|
||||
|
||||
define void @offset_order(float addrspace(1)* %out) {
|
||||
entry:
|
||||
|
@ -179,8 +179,8 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_two_val_f32_x2
|
||||
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset1:11
|
||||
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
|
||||
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
|
||||
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
|
||||
; SI: s_endpgm
|
||||
define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
|
||||
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
@ -209,8 +209,8 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspac
|
||||
}
|
||||
|
||||
; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
|
||||
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset0:3 offset1:11
|
||||
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
|
||||
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
|
||||
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
|
||||
; SI: s_endpgm
|
||||
define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
|
||||
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
@ -13,8 +13,8 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
|
||||
; CI: v_ceil_f64_e32
|
||||
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
|
||||
; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
|
||||
; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
|
||||
; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
|
||||
; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01
|
||||
; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]]
|
||||
; SI-DAG: s_not_b64
|
||||
; SI-DAG: s_and_b64
|
||||
; SI-DAG: cmp_gt_i32
|
||||
|
@ -25,7 +25,8 @@ define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
|
||||
; SI: v_rsq_clamp_f64_e32
|
||||
|
||||
; TODO: this constant should be folded:
|
||||
; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], -1
|
||||
; VI-DAG: s_mov_b32 [[NEG1:s[0-9+]]], -1
|
||||
; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], [[NEG1]]
|
||||
; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
|
||||
; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
|
||||
; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}
|
||||
|
@ -73,8 +73,8 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v16i16:
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
|
||||
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
@ -287,11 +287,9 @@ define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1
|
||||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Should have 2 ds_read_b64
|
||||
; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
|
||||
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
|
||||
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:24
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
|
||||
|
||||
; GCN: ds_write2_b64
|
||||
; GCN: ds_write2_b64
|
||||
@ -314,9 +312,9 @@ define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
|
||||
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
|
||||
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}}
|
||||
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -379,10 +377,18 @@ define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:5{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
@ -407,17 +413,31 @@ define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
|
||||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Missed read2
|
||||
; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:11 offset1:15
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
|
||||
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:64
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:9 offset1:10
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
|
||||
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:112
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
|
||||
|
||||
; EG: LDS_READ_RET
|
||||
; EG: LDS_READ_RET
|
||||
|
@ -56,10 +56,14 @@ entry:
|
||||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}local_load_v16i32:
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5 offset1:6{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:7{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
|
||||
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
|
||||
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
|
||||
define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
|
||||
entry:
|
||||
%ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
|
||||
|
@ -44,8 +44,7 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}local_memory_two_objects:
|
||||
; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
|
||||
; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
|
||||
; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
|
||||
; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
|
||||
|
||||
; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
|
||||
|
||||
|
@ -156,7 +156,8 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
|
||||
|
||||
; FUNC-LABEL: @reorder_local_offsets
|
||||
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
|
||||
; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
|
||||
; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:100
|
||||
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
|
||||
; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
|
||||
; CI: buffer_store_dword
|
||||
; CI: s_endpgm
|
||||
|
@ -46,8 +46,7 @@ define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}local_store_v3i64:
|
||||
; GCN: ds_write_b64
|
||||
; GCN: ds_write_b64
|
||||
; GCN: ds_write2_b64
|
||||
; GCN: ds_write_b64
|
||||
define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
|
||||
store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
|
||||
|
@ -42,13 +42,11 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_use_s_v_s:
|
||||
; GCN: buffer_load_dword [[VA0:v[0-9]+]]
|
||||
; GCN: buffer_load_dword [[VA1:v[0-9]+]]
|
||||
; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
||||
; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
|
||||
|
||||
; GCN: buffer_load_dword [[VA0:v[0-9]+]]
|
||||
; GCN-NOT: v_mov_b32
|
||||
; GCN: buffer_load_dword [[VA1:v[0-9]+]]
|
||||
|
||||
; GCN-NOT: v_mov_b32
|
||||
; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||
; GCN-NOT: v_mov_b32
|
||||
|
Loading…
Reference in New Issue
Block a user