1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-19 02:52:53 +02:00

AMDGPU/SI: Improve SILoadStoreOptimizer and run it before the scheduler

Summary:
The SILoadStoreOptimizer can now look ahead more then one instruction when
looking for instructions to merge, which greatly improves the number of
loads/stores that we are able to merge.

Moving the pass before scheduling avoids increasing register pressure after
the scheduler, so that the scheduler's register pressure estimates will be
more accurate.  It also gives more consistent results, since it is no longer
affected by minor scheduling changes.

Reviewers: arsenm

Subscribers: arsenm, kzhuravl, llvm-commits

Differential Revision: https://reviews.llvm.org/D23814

llvm-svn: 279991
This commit is contained in:
Tom Stellard 2016-08-29 19:15:22 +00:00
parent 2fbd6a61b4
commit 261d5c31cf
12 changed files with 205 additions and 146 deletions

View File

@ -497,6 +497,7 @@ void GCNPassConfig::addMachineSSAOptimization() {
// XXX - Can we get away without running DeadMachineInstructionElim again?
addPass(&SIFoldOperandsID);
addPass(&DeadMachineInstructionElimID);
addPass(&SILoadStoreOptimizerID);
}
void GCNPassConfig::addIRPasses() {
@ -533,17 +534,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
#endif
void GCNPassConfig::addPreRegAlloc() {
if (getOptLevel() > CodeGenOpt::None) {
// Don't do this with no optimizations since it throws away debug info by
// merging nonadjacent loads.
// This should be run after scheduling, but before register allocation. It
// also need extra copies to the address operand to be eliminated.
// FIXME: Move pre-RA and remove extra reg coalescer run.
insertPass(&MachineSchedulerID, &SILoadStoreOptimizerID);
insertPass(&MachineSchedulerID, &RegisterCoalescerID);
}
addPass(createSIShrinkInstructionsPass());
addPass(createSIWholeQuadModePass());

View File

@ -60,31 +60,35 @@ private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
AliasAnalysis *AA;
static bool offsetsCanBeCombined(unsigned Offset0,
unsigned Offset1,
unsigned EltSize);
MachineBasicBlock::iterator findMatchingDSInst(MachineBasicBlock::iterator I,
unsigned EltSize);
MachineBasicBlock::iterator findMatchingDSInst(
MachineBasicBlock::iterator I,
unsigned EltSize,
SmallVectorImpl<MachineInstr*> &InstsToMove);
MachineBasicBlock::iterator mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
unsigned EltSize);
unsigned EltSize,
ArrayRef<MachineInstr*> InstsToMove);
MachineBasicBlock::iterator mergeWrite2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
unsigned EltSize);
unsigned EltSize,
ArrayRef<MachineInstr*> InstsToMove);
public:
static char ID;
SILoadStoreOptimizer()
: MachineFunctionPass(ID), TII(nullptr), TRI(nullptr), MRI(nullptr),
LIS(nullptr) {}
AA(nullptr) {}
SILoadStoreOptimizer(const TargetMachine &TM_) : MachineFunctionPass(ID) {
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
@ -98,17 +102,9 @@ public:
return "SI Load / Store Optimizer";
}
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoPHIs);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
AU.addPreserved<LiveVariables>();
AU.addRequired<LiveIntervals>();
AU.addRequired<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@ -118,9 +114,7 @@ public:
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(LiveVariables)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(SILoadStoreOptimizer, DEBUG_TYPE,
"SI Load / Store Optimizer", false, false)
@ -132,6 +126,40 @@ FunctionPass *llvm::createSILoadStoreOptimizerPass(TargetMachine &TM) {
return new SILoadStoreOptimizer(TM);
}
static void moveInstsAfter(MachineBasicBlock::iterator I,
ArrayRef<MachineInstr*> InstsToMove) {
MachineBasicBlock *MBB = I->getParent();
++I;
for (MachineInstr *MI : InstsToMove) {
MI->removeFromParent();
MBB->insert(I, MI);
}
}
static void addDefsToList(const MachineInstr &MI,
SmallVectorImpl<const MachineOperand *> &Defs) {
for (const MachineOperand &Def : MI.defs()) {
Defs.push_back(&Def);
}
}
static bool
canMoveInstsAcrossMemOp(MachineInstr &MemOp,
ArrayRef<MachineInstr*> InstsToMove,
const SIInstrInfo *TII,
AliasAnalysis *AA) {
assert(MemOp.mayLoadOrStore());
for (MachineInstr *InstToMove : InstsToMove) {
if (!InstToMove->mayLoadOrStore())
continue;
if (!TII->areMemAccessesTriviallyDisjoint(MemOp, *InstToMove, AA))
return false;
}
return true;
}
bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
unsigned Offset1,
unsigned Size) {
@ -161,44 +189,98 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(unsigned Offset0,
MachineBasicBlock::iterator
SILoadStoreOptimizer::findMatchingDSInst(MachineBasicBlock::iterator I,
unsigned EltSize){
unsigned EltSize,
SmallVectorImpl<MachineInstr*> &InstsToMove) {
MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock &MBB = *I->getParent();
MachineBasicBlock::iterator MBBI = I;
++MBBI;
if (MBBI == MBB.end() || MBBI->getOpcode() != I->getOpcode())
return E;
SmallVector<const MachineOperand *, 8> DefsToMove;
addDefsToList(*I, DefsToMove);
// Don't merge volatiles.
if (MBBI->hasOrderedMemoryRef())
return E;
for ( ; MBBI != E; ++MBBI) {
int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
if (MBBI->getOpcode() != I->getOpcode()) {
// Check same base pointer. Be careful of subregisters, which can occur with
// vectors of pointers.
if (AddrReg0.getReg() == AddrReg1.getReg() &&
AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
AMDGPU::OpName::offset);
unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
// This is not a matching DS instruction, but we can keep looking as
// long as one of these conditions are met:
// 1. It is safe to move I down past MBBI.
// 2. It is safe to move MBBI down past the instruction that I will
// be merged into.
// Check both offsets fit in the reduced range.
if (offsetsCanBeCombined(Offset0, Offset1, EltSize))
return MBBI;
if (MBBI->hasUnmodeledSideEffects())
// We can't re-order this instruction with respect to other memory
// opeations, so we fail both conditions mentioned above.
return E;
if (MBBI->mayLoadOrStore() &&
!TII->areMemAccessesTriviallyDisjoint(*I, *MBBI, AA)) {
// We fail condition #1, but we may still be able to satisfy condition
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
InstsToMove.push_back(&*MBBI);
addDefsToList(*MBBI, DefsToMove);
continue;
}
// When we match I with another DS instruction we will be moving I down
// to the location of the matched instruction any uses of I will need to
// be moved down as well.
for (const MachineOperand *Def : DefsToMove) {
bool ReadDef = MBBI->readsVirtualRegister(Def->getReg());
// If ReadDef is true, then there is a use of Def between I
// and the instruction that I will potentially be merged with. We
// will need to move this instruction after the merged instructions.
if (ReadDef) {
InstsToMove.push_back(&*MBBI);
addDefsToList(*MBBI, DefsToMove);
break;
}
}
continue;
}
// Don't merge volatiles.
if (MBBI->hasOrderedMemoryRef())
return E;
int AddrIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::addr);
const MachineOperand &AddrReg0 = I->getOperand(AddrIdx);
const MachineOperand &AddrReg1 = MBBI->getOperand(AddrIdx);
// Check same base pointer. Be careful of subregisters, which can occur with
// vectors of pointers.
if (AddrReg0.getReg() == AddrReg1.getReg() &&
AddrReg0.getSubReg() == AddrReg1.getSubReg()) {
int OffsetIdx = AMDGPU::getNamedOperandIdx(I->getOpcode(),
AMDGPU::OpName::offset);
unsigned Offset0 = I->getOperand(OffsetIdx).getImm() & 0xffff;
unsigned Offset1 = MBBI->getOperand(OffsetIdx).getImm() & 0xffff;
// Check both offsets fit in the reduced range.
// We also need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
if (offsetsCanBeCombined(Offset0, Offset1, EltSize) &&
canMoveInstsAcrossMemOp(*MBBI, InstsToMove, TII, AA))
return MBBI;
}
// We've found a load/store that we couldn't merge for some reason.
// We could potentially keep looking, but we'd need to make sure that
// it was safe to move I and also all the instruction in InstsToMove
// down past this instruction.
// FIXME: This is too conservative.
break;
}
return E;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
unsigned EltSize) {
unsigned EltSize,
ArrayRef<MachineInstr*> InstsToMove) {
MachineBasicBlock *MBB = I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
@ -247,7 +329,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
DebugLoc DL = I->getDebugLoc();
MachineInstrBuilder Read2
= BuildMI(*MBB, I, DL, Read2Desc, DestReg)
= BuildMI(*MBB, Paired, DL, Read2Desc, DestReg)
.addOperand(*AddrReg) // addr
.addImm(NewOffset0) // offset0
.addImm(NewOffset1) // offset1
@ -258,48 +340,28 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeRead2Pair(
const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
// Copy to the old destination registers.
MachineInstr *Copy0 = BuildMI(*MBB, I, DL, CopyDesc)
BuildMI(*MBB, Paired, DL, CopyDesc)
.addOperand(*Dest0) // Copy to same destination including flags and sub reg.
.addReg(DestReg, 0, SubRegIdx0);
MachineInstr *Copy1 = BuildMI(*MBB, I, DL, CopyDesc)
MachineInstr *Copy1 = BuildMI(*MBB, Paired, DL, CopyDesc)
.addOperand(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
LIS->InsertMachineInstrInMaps(*Read2);
// repairLiveintervalsInRange() doesn't handle physical register, so we have
// to update the M0 range manually.
SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
// The new write to the original destination register is now the copy. Steal
// the old SlotIndex.
LIS->ReplaceMachineInstrInMaps(*I, *Copy0);
LIS->ReplaceMachineInstrInMaps(*Paired, *Copy1);
moveInstsAfter(Copy1, InstsToMove);
MachineBasicBlock::iterator Next = std::next(I);
I->eraseFromParent();
Paired->eraseFromParent();
LiveInterval &AddrRegLI = LIS->getInterval(AddrReg->getReg());
LIS->shrinkToUses(&AddrRegLI);
LIS->createAndComputeVirtRegInterval(DestReg);
if (UpdateM0Range) {
SlotIndex Read2Index = LIS->getInstructionIndex(*Read2);
M0Segment->end = Read2Index.getRegSlot();
}
DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
return Read2.getInstr();
return Next;
}
MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
unsigned EltSize) {
unsigned EltSize,
ArrayRef<MachineInstr*> InstsToMove) {
MachineBasicBlock *MBB = I->getParent();
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
@ -341,15 +403,8 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = I->getDebugLoc();
// repairLiveintervalsInRange() doesn't handle physical register, so we have
// to update the M0 range manually.
SlotIndex PairedIndex = LIS->getInstructionIndex(*Paired);
LiveRange &M0Range = LIS->getRegUnit(*MCRegUnitIterator(AMDGPU::M0, TRI));
LiveRange::Segment *M0Segment = M0Range.getSegmentContaining(PairedIndex);
bool UpdateM0Range = M0Segment->end == PairedIndex.getRegSlot();
MachineInstrBuilder Write2
= BuildMI(*MBB, I, DL, Write2Desc)
= BuildMI(*MBB, Paired, DL, Write2Desc)
.addOperand(*Addr) // addr
.addOperand(*Data0) // data0
.addOperand(*Data1) // data1
@ -359,24 +414,14 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeWrite2Pair(
.addMemOperand(*I->memoperands_begin())
.addMemOperand(*Paired->memoperands_begin());
// XXX - How do we express subregisters here?
unsigned OrigRegs[] = { Data0->getReg(), Data1->getReg(), Addr->getReg() };
moveInstsAfter(Write2, InstsToMove);
LIS->RemoveMachineInstrFromMaps(*I);
LIS->RemoveMachineInstrFromMaps(*Paired);
MachineBasicBlock::iterator Next = std::next(I);
I->eraseFromParent();
Paired->eraseFromParent();
// This doesn't handle physical registers like M0
LIS->repairIntervalsInRange(MBB, Write2, Write2, OrigRegs);
if (UpdateM0Range) {
SlotIndex Write2Index = LIS->getInstructionIndex(*Write2);
M0Segment->end = Write2Index.getRegSlot();
}
DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
return Write2.getInstr();
return Next;
}
// Scan through looking for adjacent LDS operations with constant offsets from
@ -394,13 +439,15 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
}
SmallVector<MachineInstr*, 8> InstsToMove;
unsigned Opc = MI.getOpcode();
if (Opc == AMDGPU::DS_READ_B32 || Opc == AMDGPU::DS_READ_B64) {
unsigned Size = (Opc == AMDGPU::DS_READ_B64) ? 8 : 4;
MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
InstsToMove);
if (Match != E) {
Modified = true;
I = mergeRead2Pair(I, Match, Size);
I = mergeRead2Pair(I, Match, Size, InstsToMove);
} else {
++I;
}
@ -408,10 +455,11 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
continue;
} else if (Opc == AMDGPU::DS_WRITE_B32 || Opc == AMDGPU::DS_WRITE_B64) {
unsigned Size = (Opc == AMDGPU::DS_WRITE_B64) ? 8 : 4;
MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size);
MachineBasicBlock::iterator Match = findMatchingDSInst(I, Size,
InstsToMove);
if (Match != E) {
Modified = true;
I = mergeWrite2Pair(I, Match, Size);
I = mergeWrite2Pair(I, Match, Size, InstsToMove);
} else {
++I;
}
@ -437,8 +485,7 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");

View File

@ -9,8 +9,8 @@
; SI-LABEL: {{^}}offset_order:
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
; SI: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:12 offset1:14
; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:44
define void @offset_order(float addrspace(1)* %out) {
entry:

View File

@ -179,8 +179,8 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add
}
; SI-LABEL: @simple_write2_two_val_f32_x2
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset1:11
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
@ -209,8 +209,8 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspac
}
; SI-LABEL: @simple_write2_two_val_f32_x2_nonzero_base
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL0]] offset0:3 offset1:11
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL1:v[0-9]+]], [[VAL1]] offset0:8 offset1:27
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
; SI: s_endpgm
define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1

View File

@ -13,8 +13,8 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
; CI: v_ceil_f64_e32
; SI: s_bfe_u32 [[SEXP:s[0-9]+]], {{s[0-9]+}}, 0xb0014
; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
; SI-DAG: s_add_i32 [[SEXP1:s[0-9]+]], [[SEXP]], 0xfffffc01
; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP1]]
; SI-DAG: s_addk_i32 [[SEXP]], 0xfc01
; SI-DAG: s_lshr_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], [[SEXP]]
; SI-DAG: s_not_b64
; SI-DAG: s_and_b64
; SI-DAG: cmp_gt_i32

View File

@ -25,7 +25,8 @@ define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
; SI: v_rsq_clamp_f64_e32
; TODO: this constant should be folded:
; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], -1
; VI-DAG: s_mov_b32 [[NEG1:s[0-9+]]], -1
; VI-DAG: s_mov_b32 s[[LOW1:[0-9+]]], [[NEG1]]
; VI-DAG: s_mov_b32 s[[HIGH1:[0-9+]]], 0x7fefffff
; VI-DAG: s_mov_b32 s[[HIGH2:[0-9+]]], 0xffefffff
; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}

View File

@ -73,8 +73,8 @@ entry:
}
; FUNC-LABEL: {{^}}local_load_v16i16:
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
; EG: LDS_READ_RET
@ -287,11 +287,9 @@ define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1
ret void
}
; FIXME: Should have 2 ds_read_b64
; FUNC-LABEL: {{^}}local_zextload_v16i16_to_v16i32:
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:24
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN: ds_write2_b64
; GCN: ds_write2_b64
@ -314,9 +312,9 @@ define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16
}
; FUNC-LABEL: {{^}}local_sextload_v16i16_to_v16i32:
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:3{{$}}
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:16{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; EG: LDS_READ_RET
; EG: LDS_READ_RET
@ -379,10 +377,18 @@ define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
}
; FUNC-LABEL: {{^}}local_sextload_v32i16_to_v32i32:
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:5{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
; EG: LDS_READ_RET
; EG: LDS_READ_RET
@ -407,17 +413,31 @@ define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
ret void
}
; FIXME: Missed read2
; FUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i32:
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:11 offset1:15
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:14 offset1:15
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:64
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:9 offset1:10
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:9
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:12 offset1:13
; GCN-DAG: ds_read_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:112
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:10 offset1:11
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:26 offset1:27
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:24 offset1:25
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:22 offset1:23
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:20 offset1:21
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:18 offset1:19
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:16 offset1:17
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:12 offset1:13
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:10 offset1:11
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:8 offset1:9
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
; EG: LDS_READ_RET
; EG: LDS_READ_RET

View File

@ -56,10 +56,14 @@ entry:
}
; FUNC-LABEL: {{^}}local_load_v16i32:
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:3 offset1:4{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:5 offset1:6{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:7{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:1 offset1:2{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:6 offset1:7{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:4 offset1:5{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:6 offset1:7
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
entry:
%ld = load <16 x i32>, <16 x i32> addrspace(3)* %in

View File

@ -44,8 +44,7 @@ entry:
; GCN-LABEL: {{^}}local_memory_two_objects:
; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*}} offset:16
; CI-DAG: ds_write_b32 [[ADDRW]], {{v[0-9]*$}}
; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]

View File

@ -156,7 +156,8 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
; FUNC-LABEL: @reorder_local_offsets
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:400
; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:100
; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
; CI: buffer_store_dword
; CI: s_endpgm

View File

@ -46,8 +46,7 @@ define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64
}
; GCN-LABEL: {{^}}local_store_v3i64:
; GCN: ds_write_b64
; GCN: ds_write_b64
; GCN: ds_write2_b64
; GCN: ds_write_b64
define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32

View File

@ -42,13 +42,11 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa
}
; GCN-LABEL: {{^}}test_use_s_v_s:
; GCN: buffer_load_dword [[VA0:v[0-9]+]]
; GCN: buffer_load_dword [[VA1:v[0-9]+]]
; GCN-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
; GCN: buffer_load_dword [[VA0:v[0-9]+]]
; GCN-NOT: v_mov_b32
; GCN: buffer_load_dword [[VA1:v[0-9]+]]
; GCN-NOT: v_mov_b32
; GCN: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; GCN-NOT: v_mov_b32