mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-24 19:52:54 +01:00
[AMDGPU] Rename amdgcn_wwm to amdgcn_strict_wwm
* Introduce the new intrinsic amdgcn_strict_wwm
* Deprecate the old intrinsic amdgcn_wwm

The change is done for consistency as the "strict" prefix will become an important, distinguishing factor between amdgcn_wqm and amdgcn_strictwqm in the future.

The "strict" prefix indicates that inactive lanes do not take part in control flow, specifically an inactive lane enabled by a strict mode will always be enabled irrespective of control flow decisions.

The amdgcn_wwm will be removed, but doing so in two steps gives users time to switch to the new name at their own pace.

Reviewed By: critson

Differential Revision: https://reviews.llvm.org/D96257
This commit is contained in:
parent
c2913dde5c
commit
97e89dc154
@ -1610,8 +1610,13 @@ def int_amdgcn_wqm_demote : Intrinsic<[],
|
||||
// Copies the active channels of the source value to the destination value,
|
||||
// with the guarantee that the source value is computed as if the entire
|
||||
// program were executed in Whole Wavefront Mode, i.e. with all channels
|
||||
// enabled, with a few exceptions: - Phi nodes with require WWM return an
|
||||
// enabled, with a few exceptions: - Phi nodes which require WWM return an
|
||||
// undefined value.
|
||||
def int_amdgcn_strict_wwm : Intrinsic<[llvm_any_ty],
|
||||
[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
|
||||
IntrConvergent, IntrWillReturn]
|
||||
>;
|
||||
// Deprecated. Use int_amdgcn_strict_wwm instead.
|
||||
def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
|
||||
[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable,
|
||||
IntrConvergent, IntrWillReturn]
|
||||
|
@ -517,7 +517,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
|
||||
}
|
||||
|
||||
// Finally mark the readlanes in the WWM section.
|
||||
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, NewV);
|
||||
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
|
||||
} else {
|
||||
switch (Op) {
|
||||
default:
|
||||
@ -621,7 +621,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
|
||||
// from the first lane, to get our lane's index into the atomic result.
|
||||
Value *LaneOffset = nullptr;
|
||||
if (ValDivergent) {
|
||||
LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
|
||||
LaneOffset =
|
||||
B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
|
||||
} else {
|
||||
switch (Op) {
|
||||
default:
|
||||
|
@ -2642,7 +2642,8 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
|
||||
Opcode = AMDGPU::SOFT_WQM;
|
||||
break;
|
||||
case Intrinsic::amdgcn_wwm:
|
||||
Opcode = AMDGPU::WWM;
|
||||
case Intrinsic::amdgcn_strict_wwm:
|
||||
Opcode = AMDGPU::STRICT_WWM;
|
||||
break;
|
||||
case Intrinsic::amdgcn_interp_p1_f16:
|
||||
SelectInterpP1F16(N);
|
||||
|
@ -927,8 +927,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
|
||||
return constrainCopyLikeIntrin(I, AMDGPU::WQM);
|
||||
case Intrinsic::amdgcn_softwqm:
|
||||
return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
|
||||
case Intrinsic::amdgcn_strict_wwm:
|
||||
case Intrinsic::amdgcn_wwm:
|
||||
return constrainCopyLikeIntrin(I, AMDGPU::WWM);
|
||||
return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
|
||||
case Intrinsic::amdgcn_writelane:
|
||||
return selectWritelane(I);
|
||||
case Intrinsic::amdgcn_div_scale:
|
||||
|
@ -3956,6 +3956,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
|
||||
case Intrinsic::amdgcn_update_dpp:
|
||||
case Intrinsic::amdgcn_mov_dpp8:
|
||||
case Intrinsic::amdgcn_mov_dpp:
|
||||
case Intrinsic::amdgcn_strict_wwm:
|
||||
case Intrinsic::amdgcn_wwm:
|
||||
case Intrinsic::amdgcn_wqm:
|
||||
case Intrinsic::amdgcn_softwqm:
|
||||
|
@ -582,7 +582,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
|
||||
case AMDGPU::COPY:
|
||||
case AMDGPU::WQM:
|
||||
case AMDGPU::SOFT_WQM:
|
||||
case AMDGPU::WWM: {
|
||||
case AMDGPU::STRICT_WWM: {
|
||||
Register DstReg = MI.getOperand(0).getReg();
|
||||
|
||||
const TargetRegisterClass *SrcRC, *DstRC;
|
||||
|
@ -1942,16 +1942,16 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
|
||||
MI.eraseFromParent();
|
||||
break;
|
||||
}
|
||||
case AMDGPU::ENTER_WWM: {
|
||||
case AMDGPU::ENTER_STRICT_WWM: {
|
||||
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
|
||||
// WWM is entered.
|
||||
// Whole Wave Mode is entered.
|
||||
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
|
||||
: AMDGPU::S_OR_SAVEEXEC_B64));
|
||||
break;
|
||||
}
|
||||
case AMDGPU::EXIT_WWM: {
|
||||
case AMDGPU::EXIT_STRICT_WWM: {
|
||||
// This only gets its own opcode so that SIPreAllocateWWMRegs can tell when
|
||||
// WWM is exited.
|
||||
// Whole Wave Mode is exited.
|
||||
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
|
||||
break;
|
||||
}
|
||||
@ -4406,7 +4406,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
|
||||
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
|
||||
case AMDGPU::WQM: return AMDGPU::WQM;
|
||||
case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
|
||||
case AMDGPU::WWM: return AMDGPU::WWM;
|
||||
case AMDGPU::STRICT_WWM: return AMDGPU::STRICT_WWM;
|
||||
case AMDGPU::S_MOV_B32: {
|
||||
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
|
||||
return MI.getOperand(1).isReg() ||
|
||||
@ -6642,7 +6642,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
|
||||
case AMDGPU::COPY:
|
||||
case AMDGPU::WQM:
|
||||
case AMDGPU::SOFT_WQM:
|
||||
case AMDGPU::WWM:
|
||||
case AMDGPU::STRICT_WWM:
|
||||
case AMDGPU::REG_SEQUENCE:
|
||||
case AMDGPU::PHI:
|
||||
case AMDGPU::INSERT_SUBREG:
|
||||
@ -6800,7 +6800,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
|
||||
case AMDGPU::INSERT_SUBREG:
|
||||
case AMDGPU::WQM:
|
||||
case AMDGPU::SOFT_WQM:
|
||||
case AMDGPU::WWM: {
|
||||
case AMDGPU::STRICT_WWM: {
|
||||
const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
|
||||
if (RI.hasAGPRs(SrcRC)) {
|
||||
if (RI.hasAGPRs(NewDstRC))
|
||||
|
@ -119,17 +119,17 @@ def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
|
||||
// turned into a copy by WQM pass, but does not seed WQM requirements.
|
||||
def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
|
||||
|
||||
// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
|
||||
// Pseudoinstruction for @llvm.amdgcn.strict.wwm. It is turned into a copy post-RA, so
|
||||
// that the @earlyclobber is respected. The @earlyclobber is to make sure that
|
||||
// the instruction that defines $src0 (which is run in WWM) doesn't
|
||||
// the instruction that defines $src0 (which is run in Whole Wave Mode) doesn't
|
||||
// accidentally clobber inactive channels of $vdst.
|
||||
let Constraints = "@earlyclobber $vdst" in {
|
||||
def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
|
||||
def STRICT_WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
|
||||
}
|
||||
|
||||
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
|
||||
|
||||
def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
|
||||
def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
|
||||
let Uses = [EXEC];
|
||||
let Defs = [EXEC, SCC];
|
||||
let hasSideEffects = 0;
|
||||
@ -137,7 +137,7 @@ def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
|
||||
let mayStore = 0;
|
||||
}
|
||||
|
||||
def EXIT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
|
||||
def EXIT_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
|
||||
let hasSideEffects = 0;
|
||||
let mayLoad = 0;
|
||||
let mayStore = 0;
|
||||
|
@ -185,13 +185,13 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
|
||||
MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
|
||||
RegsAssigned |= processDef(MI.getOperand(0));
|
||||
|
||||
if (MI.getOpcode() == AMDGPU::ENTER_WWM) {
|
||||
if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM) {
|
||||
LLVM_DEBUG(dbgs() << "entering WWM region: " << MI << "\n");
|
||||
InWWM = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
|
||||
if (MI.getOpcode() == AMDGPU::EXIT_STRICT_WWM) {
|
||||
LLVM_DEBUG(dbgs() << "exiting WWM region: " << MI << "\n");
|
||||
InWWM = false;
|
||||
}
|
||||
|
@ -27,7 +27,7 @@
|
||||
/// S_MOV_B64 EXEC, Tmp
|
||||
///
|
||||
/// We also compute when a sequence of instructions requires Whole Wavefront
|
||||
/// Mode (WWM) and insert instructions to save and restore it:
|
||||
/// Mode (StrictWWM) and insert instructions to save and restore it:
|
||||
///
|
||||
/// S_OR_SAVEEXEC_B64 Tmp, -1
|
||||
/// ...
|
||||
@ -76,7 +76,7 @@ namespace {
|
||||
|
||||
enum {
|
||||
StateWQM = 0x1,
|
||||
StateWWM = 0x2,
|
||||
StateStrictWWM = 0x2,
|
||||
StateExact = 0x4,
|
||||
};
|
||||
|
||||
@ -91,13 +91,13 @@ public:
|
||||
static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
|
||||
if (PS.State & StateWQM)
|
||||
OS << "WQM";
|
||||
if (PS.State & StateWWM) {
|
||||
if (PS.State & StateStrictWWM) {
|
||||
if (PS.State & StateWQM)
|
||||
OS << '|';
|
||||
OS << "WWM";
|
||||
OS << "StrictWWM";
|
||||
}
|
||||
if (PS.State & StateExact) {
|
||||
if (PS.State & (StateWQM | StateWWM))
|
||||
if (PS.State & (StateWQM | StateStrictWWM))
|
||||
OS << '|';
|
||||
OS << "Exact";
|
||||
}
|
||||
@ -151,7 +151,7 @@ private:
|
||||
DenseMap<const MachineInstr *, InstrInfo> Instructions;
|
||||
MapVector<MachineBasicBlock *, BlockInfo> Blocks;
|
||||
|
||||
// Tracks state (WQM/WWM/Exact) after a given instruction
|
||||
// Tracks state (WQM/StrictWWM/Exact) after a given instruction
|
||||
DenseMap<const MachineInstr *, char> StateTransition;
|
||||
|
||||
SmallVector<MachineInstr *, 2> LiveMaskQueries;
|
||||
@ -184,10 +184,10 @@ private:
|
||||
Register SaveWQM);
|
||||
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
|
||||
Register SavedWQM);
|
||||
void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
|
||||
Register SaveOrig);
|
||||
void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
|
||||
Register SavedOrig, char NonWWMState);
|
||||
void toStrictWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
|
||||
Register SaveOrig);
|
||||
void fromStrictWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
|
||||
Register SavedOrig, char NonStrictWWMState);
|
||||
|
||||
MachineBasicBlock *splitBlock(MachineBasicBlock *BB, MachineInstr *TermMI);
|
||||
|
||||
@ -465,23 +465,23 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
|
||||
LowerToCopyInstrs.push_back(&MI);
|
||||
SoftWQMInstrs.push_back(&MI);
|
||||
continue;
|
||||
} else if (Opcode == AMDGPU::WWM) {
|
||||
// The WWM intrinsic doesn't make the same guarantee, and plus it needs
|
||||
// to be executed in WQM or Exact so that its copy doesn't clobber
|
||||
// inactive lanes.
|
||||
markInstructionUses(MI, StateWWM, Worklist);
|
||||
GlobalFlags |= StateWWM;
|
||||
} else if (Opcode == AMDGPU::STRICT_WWM) {
|
||||
// The STRICT_WWM intrinsic doesn't make the same guarantee, and plus
|
||||
// it needs to be executed in WQM or Exact so that its copy doesn't
|
||||
// clobber inactive lanes.
|
||||
markInstructionUses(MI, StateStrictWWM, Worklist);
|
||||
GlobalFlags |= StateStrictWWM;
|
||||
LowerToMovInstrs.push_back(&MI);
|
||||
continue;
|
||||
} else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
|
||||
Opcode == AMDGPU::V_SET_INACTIVE_B64) {
|
||||
III.Disabled = StateWWM;
|
||||
III.Disabled = StateStrictWWM;
|
||||
MachineOperand &Inactive = MI.getOperand(2);
|
||||
if (Inactive.isReg()) {
|
||||
if (Inactive.isUndef()) {
|
||||
LowerToCopyInstrs.push_back(&MI);
|
||||
} else {
|
||||
markOperand(MI, Inactive, StateWWM, Worklist);
|
||||
markOperand(MI, Inactive, StateStrictWWM, Worklist);
|
||||
}
|
||||
}
|
||||
SetInactiveInstrs.push_back(&MI);
|
||||
@ -493,7 +493,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
|
||||
Worklist.push_back(&MBB);
|
||||
}
|
||||
GlobalFlags |= StateExact;
|
||||
III.Disabled = StateWQM | StateWWM;
|
||||
III.Disabled = StateWQM | StateStrictWWM;
|
||||
continue;
|
||||
} else {
|
||||
if (Opcode == AMDGPU::SI_PS_LIVE || Opcode == AMDGPU::SI_LIVE_MASK) {
|
||||
@ -570,7 +570,7 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
|
||||
|
||||
// Propagate backwards within block
|
||||
if (MachineInstr *PrevMI = MI.getPrevNode()) {
|
||||
char InNeeds = (II.Needs & ~StateWWM) | II.OutNeeds;
|
||||
char InNeeds = (II.Needs & ~StateStrictWWM) | II.OutNeeds;
|
||||
if (!PrevMI->isPHI()) {
|
||||
InstrInfo &PrevII = Instructions[PrevMI];
|
||||
if ((PrevII.OutNeeds | InNeeds) != PrevII.OutNeeds) {
|
||||
@ -586,10 +586,10 @@ void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
|
||||
if (II.Needs != 0)
|
||||
markInstructionUses(MI, II.Needs, Worklist);
|
||||
|
||||
// Ensure we process a block containing WWM, even if it does not require any
|
||||
// WQM transitions.
|
||||
if (II.Needs & StateWWM)
|
||||
BI.Needs |= StateWWM;
|
||||
// Ensure we process a block containing StrictWWM, even if it does not require
|
||||
// any WQM transitions.
|
||||
if (II.Needs & StateStrictWWM)
|
||||
BI.Needs |= StateStrictWWM;
|
||||
}
|
||||
|
||||
void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
|
||||
@ -947,7 +947,7 @@ MachineInstr *SIWholeQuadMode::lowerKillI1(MachineBasicBlock &MBB,
|
||||
|
||||
// Replace (or supplement) instructions accessing live mask.
|
||||
// This can only happen once all the live mask registers have been created
|
||||
// and the execute state (WQM/WWM/Exact) of instructions is known.
|
||||
// and the execute state (WQM/StrictWWM/Exact) of instructions is known.
|
||||
void SIWholeQuadMode::lowerBlock(MachineBasicBlock &MBB) {
|
||||
auto BII = Blocks.find(&MBB);
|
||||
if (BII == Blocks.end())
|
||||
@ -1105,28 +1105,30 @@ void SIWholeQuadMode::toWQM(MachineBasicBlock &MBB,
|
||||
StateTransition[MI] = StateWQM;
|
||||
}
|
||||
|
||||
void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator Before,
|
||||
Register SaveOrig) {
|
||||
void SIWholeQuadMode::toStrictWWM(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator Before,
|
||||
Register SaveOrig) {
|
||||
MachineInstr *MI;
|
||||
|
||||
assert(SaveOrig);
|
||||
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_WWM), SaveOrig)
|
||||
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::ENTER_STRICT_WWM),
|
||||
SaveOrig)
|
||||
.addImm(-1);
|
||||
LIS->InsertMachineInstrInMaps(*MI);
|
||||
StateTransition[MI] = StateWWM;
|
||||
StateTransition[MI] = StateStrictWWM;
|
||||
}
|
||||
|
||||
void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator Before,
|
||||
Register SavedOrig, char NonWWMState) {
|
||||
void SIWholeQuadMode::fromStrictWWM(MachineBasicBlock &MBB,
|
||||
MachineBasicBlock::iterator Before,
|
||||
Register SavedOrig,
|
||||
char NonStrictWWMState) {
|
||||
MachineInstr *MI;
|
||||
|
||||
assert(SavedOrig);
|
||||
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), Exec)
|
||||
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_STRICT_WWM), Exec)
|
||||
.addReg(SavedOrig);
|
||||
LIS->InsertMachineInstrInMaps(*MI);
|
||||
StateTransition[MI] = NonWWMState;
|
||||
StateTransition[MI] = NonStrictWWMState;
|
||||
}
|
||||
|
||||
void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
|
||||
@ -1147,10 +1149,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
|
||||
<< ":\n");
|
||||
|
||||
Register SavedWQMReg;
|
||||
Register SavedNonWWMReg;
|
||||
Register SavedNonStrictWWMReg;
|
||||
bool WQMFromExec = IsEntry;
|
||||
char State = (IsEntry || !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
|
||||
char NonWWMState = 0;
|
||||
char NonStrictWWMState = 0;
|
||||
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
|
||||
|
||||
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
|
||||
@ -1164,25 +1166,25 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
|
||||
// Exact or vice versa.
|
||||
MachineBasicBlock::iterator FirstWQM = IE;
|
||||
|
||||
// This stores the first instruction where it's safe to switch from WWM to
|
||||
// Exact/WQM or to switch to WWM. It must always be the same as, or after,
|
||||
// FirstWQM since if it's safe to switch to/from WWM, it must be safe to
|
||||
// switch to/from WQM as well.
|
||||
MachineBasicBlock::iterator FirstWWM = IE;
|
||||
// This stores the first instruction where it's safe to switch from StrictWWM
|
||||
// to Exact/WQM or to switch to StrictWWM. It must always be the same as, or
|
||||
// after, FirstWQM since if it's safe to switch to/from StrictWWM, it must be
|
||||
// safe to switch to/from WQM as well.
|
||||
MachineBasicBlock::iterator FirstStrictWWM = IE;
|
||||
|
||||
// Record initial state in block information.
|
||||
BI.InitialState = State;
|
||||
|
||||
for (;;) {
|
||||
MachineBasicBlock::iterator Next = II;
|
||||
char Needs = StateExact | StateWQM; // WWM is disabled by default
|
||||
char Needs = StateExact | StateWQM; // StrictWWM is disabled by default
|
||||
char OutNeeds = 0;
|
||||
|
||||
if (FirstWQM == IE)
|
||||
FirstWQM = II;
|
||||
|
||||
if (FirstWWM == IE)
|
||||
FirstWWM = II;
|
||||
if (FirstStrictWWM == IE)
|
||||
FirstStrictWWM = II;
|
||||
|
||||
// First, figure out the allowed states (Needs) based on the propagated
|
||||
// flags.
|
||||
@ -1192,8 +1194,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
|
||||
if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
|
||||
auto III = Instructions.find(&MI);
|
||||
if (III != Instructions.end()) {
|
||||
if (III->second.Needs & StateWWM)
|
||||
Needs = StateWWM;
|
||||
if (III->second.Needs & StateStrictWWM)
|
||||
Needs = StateStrictWWM;
|
||||
else if (III->second.Needs & StateWQM)
|
||||
Needs = StateWQM;
|
||||
else
|
||||
@ -1202,8 +1204,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
|
||||
}
|
||||
} else {
|
||||
// If the instruction doesn't actually need a correct EXEC, then we can
|
||||
// safely leave WWM enabled.
|
||||
Needs = StateExact | StateWQM | StateWWM;
|
||||
// safely leave StrictWWM enabled.
|
||||
Needs = StateExact | StateWQM | StateStrictWWM;
|
||||
}
|
||||
|
||||
if (MI.isTerminator() && OutNeeds == StateExact)
|
||||
@ -1223,9 +1225,9 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
|
||||
// Now, transition if necessary.
|
||||
if (!(Needs & State)) {
|
||||
MachineBasicBlock::iterator First;
|
||||
if (State == StateWWM || Needs == StateWWM) {
|
||||
// We must switch to or from WWM
|
||||
First = FirstWWM;
|
||||
if (State == StateStrictWWM || Needs == StateStrictWWM) {
|
||||
// We must switch to or from StrictWWM
|
||||
First = FirstStrictWWM;
|
||||
} else {
|
||||
// We only need to switch to/from WQM, so we can use FirstWQM
|
||||
First = FirstWQM;
|
||||
@ -1235,11 +1237,12 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
|
||||
bool SaveSCC = false;
|
||||
switch (State) {
|
||||
case StateExact:
|
||||
case StateWWM:
|
||||
case StateStrictWWM:
|
||||
// Exact/StrictWWM -> StrictWWM: save SCC
|
||||
// Exact/StrictWWM -> WQM: save SCC if WQM mask is generated from exec
|
||||
// Exact/StrictWWM -> Exact: no save
|
||||
SaveSCC = (Needs & StateWWM) || ((Needs & StateWQM) && WQMFromExec);
|
||||
SaveSCC =
|
||||
(Needs & StateStrictWWM) || ((Needs & StateWQM) && WQMFromExec);
|
||||
break;
|
||||
case StateWQM:
|
||||
// WQM -> Exact/WWM: save SCC
|
||||
@ -1252,20 +1255,20 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
|
||||
MachineBasicBlock::iterator Before =
|
||||
prepareInsertion(MBB, First, II, Needs == StateWQM, SaveSCC);
|
||||
|
||||
if (State == StateWWM) {
|
||||
assert(SavedNonWWMReg);
|
||||
fromWWM(MBB, Before, SavedNonWWMReg, NonWWMState);
|
||||
LIS->createAndComputeVirtRegInterval(SavedNonWWMReg);
|
||||
SavedNonWWMReg = 0;
|
||||
State = NonWWMState;
|
||||
if (State == StateStrictWWM) {
|
||||
assert(SavedNonStrictWWMReg);
|
||||
fromStrictWWM(MBB, Before, SavedNonStrictWWMReg, NonStrictWWMState);
|
||||
LIS->createAndComputeVirtRegInterval(SavedNonStrictWWMReg);
|
||||
SavedNonStrictWWMReg = 0;
|
||||
State = NonStrictWWMState;
|
||||
}
|
||||
|
||||
if (Needs == StateWWM) {
|
||||
NonWWMState = State;
|
||||
assert(!SavedNonWWMReg);
|
||||
SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
|
||||
toWWM(MBB, Before, SavedNonWWMReg);
|
||||
State = StateWWM;
|
||||
if (Needs == StateStrictWWM) {
|
||||
NonStrictWWMState = State;
|
||||
assert(!SavedNonStrictWWMReg);
|
||||
SavedNonStrictWWMReg = MRI->createVirtualRegister(BoolRC);
|
||||
toStrictWWM(MBB, Before, SavedNonStrictWWMReg);
|
||||
State = StateStrictWWM;
|
||||
} else {
|
||||
if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
|
||||
if (!WQMFromExec && (OutNeeds & StateWQM)) {
|
||||
@ -1287,17 +1290,18 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
|
||||
}
|
||||
State = StateWQM;
|
||||
} else {
|
||||
// We can get here if we transitioned from WWM to a non-WWM state that
|
||||
// already matches our needs, but we shouldn't need to do anything.
|
||||
// We can get here if we transitioned from StrictWWM to a
|
||||
// non-StrictWWM state that already matches our needs, but we
|
||||
// shouldn't need to do anything.
|
||||
assert(Needs & State);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Needs != (StateExact | StateWQM | StateWWM)) {
|
||||
if (Needs != (StateExact | StateWQM | StateStrictWWM)) {
|
||||
if (Needs != (StateExact | StateWQM))
|
||||
FirstWQM = IE;
|
||||
FirstWWM = IE;
|
||||
FirstStrictWWM = IE;
|
||||
}
|
||||
|
||||
if (II == IE)
|
||||
@ -1306,7 +1310,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
|
||||
II = Next;
|
||||
}
|
||||
assert(!SavedWQMReg);
|
||||
assert(!SavedNonWWMReg);
|
||||
assert(!SavedNonStrictWWMReg);
|
||||
}
|
||||
|
||||
void SIWholeQuadMode::lowerLiveMaskQueries() {
|
||||
@ -1438,9 +1442,10 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
|
||||
|
||||
LiveMaskReg = Exec;
|
||||
|
||||
// Shader is simple does not need WQM/WWM or any complex lowering
|
||||
if (!(GlobalFlags & (StateWQM | StateWWM)) && LowerToCopyInstrs.empty() &&
|
||||
LowerToMovInstrs.empty() && KillInstrs.empty()) {
|
||||
// Shader is simple and does not need WQM/StrictWWM or any complex lowering
|
||||
if (!(GlobalFlags & (StateWQM | StateStrictWWM)) &&
|
||||
LowerToCopyInstrs.empty() && LowerToMovInstrs.empty() &&
|
||||
KillInstrs.empty()) {
|
||||
lowerLiveMaskQueries();
|
||||
return !LiveMaskQueries.empty();
|
||||
}
|
||||
|
@ -1,13 +1,15 @@
|
||||
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
|
||||
; RUN: llc -global-isel -march=amdgcn -mcpu=hawaii -stop-after=instruction-select -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
|
||||
|
||||
define amdgpu_ps float @wwm_f32(float %val) {
|
||||
; GCN-LABEL: name: wwm_f32
|
||||
; GCN: bb.1 (%ir-block.0):
|
||||
; GCN: liveins: $vgpr0
|
||||
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN: [[WWM:%[0-9]+]]:vgpr_32 = WWM [[COPY]], implicit $exec
|
||||
; GCN: $vgpr0 = COPY [[WWM]]
|
||||
; GCN: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY]], implicit $exec
|
||||
; GCN: $vgpr0 = COPY [[STRICT_WWM]]
|
||||
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
|
||||
%ret = call float @llvm.amdgcn.wwm.f32(float %val)
|
||||
ret float %ret
|
||||
@ -18,8 +20,8 @@ define amdgpu_ps float @wwm_v2f16(float %arg) {
|
||||
; GCN: bb.1 (%ir-block.0):
|
||||
; GCN: liveins: $vgpr0
|
||||
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN: [[WWM:%[0-9]+]]:vgpr_32 = WWM [[COPY]], implicit $exec
|
||||
; GCN: $vgpr0 = COPY [[WWM]]
|
||||
; GCN: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY]], implicit $exec
|
||||
; GCN: $vgpr0 = COPY [[STRICT_WWM]]
|
||||
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
|
||||
%val = bitcast float %arg to <2 x half>
|
||||
%ret = call <2 x half> @llvm.amdgcn.wwm.v2f16(<2 x half> %val)
|
||||
@ -34,9 +36,9 @@ define amdgpu_ps <2 x float> @wwm_f64(double %val) {
|
||||
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
|
||||
; GCN: [[WWM:%[0-9]+]]:vreg_64 = WWM [[REG_SEQUENCE]], implicit $exec
|
||||
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[WWM]].sub0
|
||||
; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[WWM]].sub1
|
||||
; GCN: [[STRICT_WWM:%[0-9]+]]:vreg_64 = STRICT_WWM [[REG_SEQUENCE]], implicit $exec
|
||||
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub0
|
||||
; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub1
|
||||
; GCN: $vgpr0 = COPY [[COPY2]]
|
||||
; GCN: $vgpr1 = COPY [[COPY3]]
|
||||
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
|
||||
@ -61,10 +63,10 @@ define amdgpu_ps <3 x float> @wwm_v3f32(<3 x float> %val) {
|
||||
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
|
||||
; GCN: [[WWM:%[0-9]+]]:vreg_96 = WWM [[REG_SEQUENCE]], implicit $exec
|
||||
; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[WWM]].sub0
|
||||
; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[WWM]].sub1
|
||||
; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[WWM]].sub2
|
||||
; GCN: [[STRICT_WWM:%[0-9]+]]:vreg_96 = STRICT_WWM [[REG_SEQUENCE]], implicit $exec
|
||||
; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub0
|
||||
; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub1
|
||||
; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub2
|
||||
; GCN: $vgpr0 = COPY [[COPY3]]
|
||||
; GCN: $vgpr1 = COPY [[COPY4]]
|
||||
; GCN: $vgpr2 = COPY [[COPY5]]
|
||||
@ -73,10 +75,87 @@ define amdgpu_ps <3 x float> @wwm_v3f32(<3 x float> %val) {
|
||||
ret <3 x float> %ret
|
||||
}
|
||||
|
||||
define amdgpu_ps float @strict_wwm_f32(float %val) {
|
||||
; GCN-LABEL: name: strict_wwm_f32
|
||||
; GCN: bb.1 (%ir-block.0):
|
||||
; GCN: liveins: $vgpr0
|
||||
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY]], implicit $exec
|
||||
; GCN: $vgpr0 = COPY [[STRICT_WWM]]
|
||||
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
|
||||
%ret = call float @llvm.amdgcn.strict.wwm.f32(float %val)
|
||||
ret float %ret
|
||||
}
|
||||
|
||||
define amdgpu_ps float @strict_wwm_v2f16(float %arg) {
|
||||
; GCN-LABEL: name: strict_wwm_v2f16
|
||||
; GCN: bb.1 (%ir-block.0):
|
||||
; GCN: liveins: $vgpr0
|
||||
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN: [[STRICT_WWM:%[0-9]+]]:vgpr_32 = STRICT_WWM [[COPY]], implicit $exec
|
||||
; GCN: $vgpr0 = COPY [[STRICT_WWM]]
|
||||
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0
|
||||
%val = bitcast float %arg to <2 x half>
|
||||
%ret = call <2 x half> @llvm.amdgcn.strict.wwm.v2f16(<2 x half> %val)
|
||||
%bc = bitcast <2 x half> %ret to float
|
||||
ret float %bc
|
||||
}
|
||||
|
||||
define amdgpu_ps <2 x float> @strict_wwm_f64(double %val) {
|
||||
; GCN-LABEL: name: strict_wwm_f64
|
||||
; GCN: bb.1 (%ir-block.0):
|
||||
; GCN: liveins: $vgpr0, $vgpr1
|
||||
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1
|
||||
; GCN: [[STRICT_WWM:%[0-9]+]]:vreg_64 = STRICT_WWM [[REG_SEQUENCE]], implicit $exec
|
||||
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub0
|
||||
; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub1
|
||||
; GCN: $vgpr0 = COPY [[COPY2]]
|
||||
; GCN: $vgpr1 = COPY [[COPY3]]
|
||||
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
|
||||
%ret = call double @llvm.amdgcn.strict.wwm.f64(double %val)
|
||||
%bitcast = bitcast double %ret to <2 x float>
|
||||
ret <2 x float> %bitcast
|
||||
}
|
||||
|
||||
; TODO
|
||||
; define amdgpu_ps float @strict_wwm_i1_vcc(float %val) {
|
||||
; %vcc = fcmp oeq float %val, 0.0
|
||||
; %ret = call i1 @llvm.amdgcn.strict.wwm.i1(i1 %vcc)
|
||||
; %select = select i1 %ret, float 1.0, float 0.0
|
||||
; ret float %select
|
||||
; }
|
||||
|
||||
define amdgpu_ps <3 x float> @strict_wwm_v3f32(<3 x float> %val) {
|
||||
; GCN-LABEL: name: strict_wwm_v3f32
|
||||
; GCN: bb.1 (%ir-block.0):
|
||||
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
|
||||
; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GCN: [[REG_SEQUENCE:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2
|
||||
; GCN: [[STRICT_WWM:%[0-9]+]]:vreg_96 = STRICT_WWM [[REG_SEQUENCE]], implicit $exec
|
||||
; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub0
|
||||
; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub1
|
||||
; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[STRICT_WWM]].sub2
|
||||
; GCN: $vgpr0 = COPY [[COPY3]]
|
||||
; GCN: $vgpr1 = COPY [[COPY4]]
|
||||
; GCN: $vgpr2 = COPY [[COPY5]]
|
||||
; GCN: SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1, implicit $vgpr2
|
||||
%ret = call <3 x float> @llvm.amdgcn.strict.wwm.v3f32(<3 x float> %val)
|
||||
ret <3 x float> %ret
|
||||
}
|
||||
|
||||
declare i1 @llvm.amdgcn.wwm.i1(i1) #0
|
||||
declare float @llvm.amdgcn.wwm.f32(float) #0
|
||||
declare <2 x half> @llvm.amdgcn.wwm.v2f16(<2 x half>) #0
|
||||
declare <3 x float> @llvm.amdgcn.wwm.v3f32(<3 x float>) #0
|
||||
declare double @llvm.amdgcn.wwm.f64(double) #0
|
||||
declare i1 @llvm.amdgcn.strict.wwm.i1(i1) #0
|
||||
declare float @llvm.amdgcn.strict.wwm.f32(float) #0
|
||||
declare <2 x half> @llvm.amdgcn.strict.wwm.v2f16(<2 x half>) #0
|
||||
declare <3 x float> @llvm.amdgcn.strict.wwm.v3f32(<3 x float>) #0
|
||||
declare double @llvm.amdgcn.strict.wwm.f64(double) #0
|
||||
|
||||
attributes #0 = { nounwind readnone speculatable }
|
||||
|
@ -3,30 +3,30 @@
|
||||
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=regbankselect -regbankselect-greedy -verify-machineinstrs -o - %s | FileCheck %s
|
||||
|
||||
---
|
||||
name: wwm_s
|
||||
name: strict_wwm_s
|
||||
legalized: true
|
||||
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $sgpr0
|
||||
; CHECK-LABEL: name: wwm_s
|
||||
; CHECK-LABEL: name: strict_wwm_s
|
||||
; CHECK: [[COPY:%[0-9]+]]:sgpr(s32) = COPY $sgpr0
|
||||
; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY [[COPY]](s32)
|
||||
; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wwm), [[COPY1]](s32)
|
||||
; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.strict.wwm), [[COPY1]](s32)
|
||||
%0:_(s32) = COPY $sgpr0
|
||||
%1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wwm), %0
|
||||
%1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.strict.wwm), %0
|
||||
...
|
||||
|
||||
---
|
||||
name: wwm_v
|
||||
name: strict_wwm_v
|
||||
legalized: true
|
||||
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: $vgpr0
|
||||
; CHECK-LABEL: name: wwm_v
|
||||
; CHECK-LABEL: name: strict_wwm_v
|
||||
; CHECK: [[COPY:%[0-9]+]]:vgpr(s32) = COPY $vgpr0
|
||||
; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wwm), [[COPY]](s32)
|
||||
; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.strict.wwm), [[COPY]](s32)
|
||||
%0:_(s32) = COPY $vgpr0
|
||||
%1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.wwm), %0
|
||||
%1:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.strict.wwm), %0
|
||||
...
|
||||
|
@ -1,6 +1,9 @@
|
||||
; RUN: llc -mtriple=amdgcn--amdpal -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
define amdgpu_hs void @foo(i32 inreg %arg, <4 x i32> inreg %buffer) {
|
||||
; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
|
||||
|
||||
; GCN-LABEL: wwm:
|
||||
define amdgpu_hs void @wwm(i32 inreg %arg, <4 x i32> inreg %buffer) {
|
||||
entry:
|
||||
br label %work
|
||||
|
||||
@ -36,8 +39,46 @@ work:
|
||||
br i1 %tmp34, label %bb602, label %bb42
|
||||
}
|
||||
|
||||
; GCN-LABEL: strict_wwm:
|
||||
define amdgpu_hs void @strict_wwm(i32 inreg %arg, <4 x i32> inreg %buffer) {
|
||||
entry:
|
||||
br label %work
|
||||
|
||||
bb42:
|
||||
br label %bb602
|
||||
|
||||
bb602:
|
||||
%tmp603 = phi i32 [ 0, %bb42 ], [ 1, %work ]
|
||||
%tmp607 = icmp eq i32 %tmp603, %tmp1196
|
||||
br i1 %tmp607, label %bb49, label %bb54
|
||||
|
||||
bb49:
|
||||
call void @llvm.amdgcn.raw.tbuffer.store.f32(float 1.0, <4 x i32> %buffer, i32 4, i32 1, i32 116, i32 1)
|
||||
ret void
|
||||
|
||||
bb54:
|
||||
ret void
|
||||
|
||||
work:
|
||||
; GCN: s_not_b64 exec, exec
|
||||
; GCN: v_mov_b32_e32 v[[tmp1189:[0-9]+]], 1
|
||||
; GCN: s_not_b64 exec, exec
|
||||
%tmp1189 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 4, i32 1)
|
||||
|
||||
; GCN: s_or_saveexec_b64 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, -1
|
||||
; GCN: v_lshlrev_b32_e32 v[[tmp1191:[0-9]+]], 2, v[[tmp1189]]
|
||||
%tmp1191 = mul i32 %tmp1189, 4
|
||||
|
||||
; GCN: s_mov_b64 exec, s{{\[}}[[LO]]:[[HI]]{{\]}}
|
||||
%tmp1196 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp1191)
|
||||
|
||||
%tmp34 = icmp eq i32 %arg, 0
|
||||
br i1 %tmp34, label %bb602, label %bb42
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #0
|
||||
declare i32 @llvm.amdgcn.wwm.i32(i32) #1
|
||||
declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #1
|
||||
declare void @llvm.amdgcn.raw.tbuffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg, i32 immarg) #2
|
||||
|
||||
attributes #0 = { convergent nounwind readnone willreturn }
|
||||
|
@ -77,7 +77,8 @@ main_body:
|
||||
ret float %out.0
|
||||
}
|
||||
|
||||
; Make sure the transition from Exact to WWM then softwqm does not trigger WQM.
|
||||
; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
|
||||
; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_wwm1:
|
||||
;CHECK: s_or_saveexec_b64 [[ORIG0:s\[[0-9]+:[0-9]+\]]], -1
|
||||
@ -101,6 +102,31 @@ main_body:
|
||||
ret float %out.0
|
||||
}
|
||||
|
||||
; Make sure the transition from Exact to STRICT_WWM then softwqm does not trigger WQM.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_strict_wwm1:
|
||||
;CHECK: s_or_saveexec_b64 [[ORIG0:s\[[0-9]+:[0-9]+\]]], -1
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: s_mov_b64 exec, [[ORIG0]]
|
||||
;CHECK: buffer_store_dword
|
||||
;CHECK: s_or_saveexec_b64 [[ORIG1:s\[[0-9]+:[0-9]+\]]], -1
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: v_add_f32_e32
|
||||
;CHECK: s_mov_b64 exec, [[ORIG1]]
|
||||
;CHECK-NOT: s_wqm_b64
|
||||
define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
|
||||
main_body:
|
||||
%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
|
||||
call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
|
||||
%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
|
||||
%temp = fadd float %src0, %src1
|
||||
%temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
|
||||
%out = fadd float %temp.0, %temp.0
|
||||
%out.0 = call float @llvm.amdgcn.softwqm.f32(float %out)
|
||||
ret float %out.0
|
||||
}
|
||||
|
||||
|
||||
; Check that softwqm on one case of branch does not trigger WQM for shader.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_control_flow_0:
|
||||
@ -183,6 +209,7 @@ declare void @llvm.amdgcn.kill(i1) #1
|
||||
declare float @llvm.amdgcn.wqm.f32(float) #3
|
||||
declare float @llvm.amdgcn.softwqm.f32(float) #3
|
||||
declare i32 @llvm.amdgcn.softwqm.i32(i32) #3
|
||||
declare float @llvm.amdgcn.strict.wwm.f32(float) #3
|
||||
declare float @llvm.amdgcn.wwm.f32(float) #3
|
||||
|
||||
attributes #1 = { nounwind }
|
||||
|
@ -701,6 +701,7 @@ break:
|
||||
ret <4 x float> %c.iv
|
||||
}
|
||||
|
||||
; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
|
||||
; GCN-LABEL: {{^}}test_wwm1:
|
||||
; GFX1032: s_or_saveexec_b32 [[SAVE:s[0-9]+]], -1
|
||||
; GFX1032: s_mov_b32 exec_lo, [[SAVE]]
|
||||
@ -744,6 +745,50 @@ endif:
|
||||
ret float %out.2
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_strict_wwm1:
|
||||
; GFX1032: s_or_saveexec_b32 [[SAVE:s[0-9]+]], -1
|
||||
; GFX1032: s_mov_b32 exec_lo, [[SAVE]]
|
||||
; GFX1064: s_or_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], -1
|
||||
; GFX1064: s_mov_b64 exec, [[SAVE]]
|
||||
define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1, float %src0, float %src1) {
|
||||
main_body:
|
||||
%out = fadd float %src0, %src1
|
||||
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
|
||||
ret float %out.0
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_strict_wwm2:
|
||||
; GFX1032: v_cmp_gt_u32_e32 vcc_lo, 32, v{{[0-9]+}}
|
||||
; GFX1032: s_and_saveexec_b32 [[SAVE1:s[0-9]+]], vcc_lo
|
||||
; GFX1032: s_or_saveexec_b32 [[SAVE2:s[0-9]+]], -1
|
||||
; GFX1032: s_mov_b32 exec_lo, [[SAVE2]]
|
||||
; GFX1032: s_or_b32 exec_lo, exec_lo, [[SAVE1]]
|
||||
; GFX1064: v_cmp_gt_u32_e32 vcc, 32, v{{[0-9]+}}
|
||||
; GFX1064: s_and_saveexec_b64 [[SAVE1:s\[[0-9:]+\]]], vcc{{$}}
|
||||
; GFX1064: s_or_saveexec_b64 [[SAVE2:s\[[0-9:]+\]]], -1
|
||||
; GFX1064: s_mov_b64 exec, [[SAVE2]]
|
||||
; GFX1064: s_or_b64 exec, exec, [[SAVE1]]
|
||||
define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx) {
|
||||
main_body:
|
||||
; use mbcnt to make sure the branch is divergent
|
||||
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
|
||||
%cc = icmp uge i32 %hi, 32
|
||||
br i1 %cc, label %endif, label %if
|
||||
|
||||
if:
|
||||
%src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
|
||||
%out = fadd float %src, %src
|
||||
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
|
||||
%out.1 = fadd float %src, %out.0
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
%out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
|
||||
ret float %out.2
|
||||
}
|
||||
|
||||
|
||||
; GCN-LABEL: {{^}}test_wqm1:
|
||||
; GFX1032: s_mov_b32 [[ORIG:s[0-9]+]], exec_lo
|
||||
; GFX1032: s_wqm_b32 exec_lo, exec_lo
|
||||
@ -1123,6 +1168,7 @@ declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
|
||||
declare i64 @llvm.amdgcn.set.inactive.i64(i64, i64)
|
||||
declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32)
|
||||
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32)
|
||||
declare float @llvm.amdgcn.strict.wwm.f32(float)
|
||||
declare float @llvm.amdgcn.wwm.f32(float)
|
||||
declare i32 @llvm.amdgcn.wqm.i32(i32)
|
||||
declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32)
|
||||
|
@ -146,6 +146,8 @@ main_body:
|
||||
ret float %out.2
|
||||
}
|
||||
|
||||
; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
|
||||
|
||||
; Check that WWM is triggered by the wwm intrinsic.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_wwm1:
|
||||
@ -331,14 +333,14 @@ endloop:
|
||||
|
||||
; Check that @llvm.amdgcn.set.inactive disables WWM.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_set_inactive1:
|
||||
;CHECK-LABEL: {{^}}test_wwm_set_inactive1:
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: s_not_b64 exec, exec
|
||||
;CHECK: v_mov_b32_e32
|
||||
;CHECK: s_not_b64 exec, exec
|
||||
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
|
||||
;CHECK: v_add_{{[iu]}}32_e32
|
||||
define amdgpu_ps void @test_set_inactive1(i32 inreg %idx) {
|
||||
define amdgpu_ps void @test_wwm_set_inactive1(i32 inreg %idx) {
|
||||
main_body:
|
||||
%src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
|
||||
%src.0 = bitcast float %src to i32
|
||||
@ -822,6 +824,243 @@ ENDIF:
|
||||
ret float %r
|
||||
}
|
||||
|
||||
; Check that WWM is triggered by the strict_wwm intrinsic.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_strict_wwm1:
|
||||
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: v_add_f32_e32
|
||||
define amdgpu_ps float @test_strict_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
|
||||
main_body:
|
||||
%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
|
||||
%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
|
||||
%out = fadd float %src0, %src1
|
||||
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
|
||||
ret float %out.0
|
||||
}
|
||||
|
||||
; Same as above, but with an integer type.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_strict_wwm2:
|
||||
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: v_add_{{[iu]}}32_e32
|
||||
define amdgpu_ps float @test_strict_wwm2(i32 inreg %idx0, i32 inreg %idx1) {
|
||||
main_body:
|
||||
%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
|
||||
%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
|
||||
%src0.0 = bitcast float %src0 to i32
|
||||
%src1.0 = bitcast float %src1 to i32
|
||||
%out = add i32 %src0.0, %src1.0
|
||||
%out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
|
||||
%out.1 = bitcast i32 %out.0 to float
|
||||
ret float %out.1
|
||||
}
|
||||
|
||||
; Check that we don't leave WWM on for computations that don't require WWM,
|
||||
; since that will lead to clobbering things that aren't supposed to be clobbered
|
||||
; in cases like this.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_strict_wwm3:
|
||||
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: v_add_f32_e32
|
||||
;CHECK: s_mov_b64 exec, [[ORIG]]
|
||||
;CHECK: v_add_f32_e32
|
||||
define amdgpu_ps float @test_strict_wwm3(i32 inreg %idx) {
|
||||
main_body:
|
||||
; use mbcnt to make sure the branch is divergent
|
||||
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
|
||||
%cc = icmp uge i32 %hi, 32
|
||||
br i1 %cc, label %endif, label %if
|
||||
|
||||
if:
|
||||
%src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
|
||||
%out = fadd float %src, %src
|
||||
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
|
||||
%out.1 = fadd float %src, %out.0
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
%out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
|
||||
ret float %out.2
|
||||
}
|
||||
|
||||
; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
|
||||
; write could clobber disabled channels in the non-WWM one.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_strict_wwm4:
|
||||
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: v_add_f32_e32
|
||||
;CHECK: s_mov_b64 exec, [[ORIG]]
|
||||
;CHECK-NEXT: v_mov_b32_e32
|
||||
define amdgpu_ps float @test_strict_wwm4(i32 inreg %idx) {
|
||||
main_body:
|
||||
; use mbcnt to make sure the branch is divergent
|
||||
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
|
||||
%cc = icmp uge i32 %hi, 32
|
||||
br i1 %cc, label %endif, label %if
|
||||
|
||||
if:
|
||||
%src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
|
||||
%out = fadd float %src, %src
|
||||
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
%out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
|
||||
ret float %out.1
|
||||
}
|
||||
|
||||
; Make sure the transition from Exact to WWM then WQM works properly.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_strict_wwm5:
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: buffer_store_dword
|
||||
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: v_add_f32_e32
|
||||
;CHECK: s_mov_b64 exec, [[ORIG]]
|
||||
;CHECK: s_wqm_b64 exec, exec
|
||||
define amdgpu_ps float @test_strict_wwm5(i32 inreg %idx0, i32 inreg %idx1) {
|
||||
main_body:
|
||||
%src0 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
|
||||
call void @llvm.amdgcn.struct.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i32 0, i32 0)
|
||||
%src1 = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i32 0, i32 0)
|
||||
%temp = fadd float %src1, %src1
|
||||
%temp.0 = call float @llvm.amdgcn.strict.wwm.f32(float %temp)
|
||||
%out = fadd float %temp.0, %temp.0
|
||||
%out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
|
||||
ret float %out.0
|
||||
}
|
||||
|
||||
; Check that WWM is turned on correctly across basic block boundaries.
|
||||
; if..then..endif version
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_strict_wwm6_then:
|
||||
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
|
||||
;SI-CHECK: buffer_load_dword
|
||||
;VI-CHECK: flat_load_dword
|
||||
;CHECK: s_mov_b64 exec, [[ORIG]]
|
||||
;CHECK: %if
|
||||
;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
|
||||
;SI-CHECK: buffer_load_dword
|
||||
;VI-CHECK: flat_load_dword
|
||||
;CHECK: v_add_f32_e32
|
||||
;CHECK: s_mov_b64 exec, [[ORIG2]]
|
||||
define amdgpu_ps float @test_strict_wwm6_then() {
|
||||
main_body:
|
||||
%src0 = load volatile float, float addrspace(1)* undef
|
||||
; use mbcnt to make sure the branch is divergent
|
||||
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
|
||||
%cc = icmp uge i32 %hi, 32
|
||||
br i1 %cc, label %endif, label %if
|
||||
|
||||
if:
|
||||
%src1 = load volatile float, float addrspace(1)* undef
|
||||
%out = fadd float %src0, %src1
|
||||
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
|
||||
br label %endif
|
||||
|
||||
endif:
|
||||
%out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
|
||||
ret float %out.1
|
||||
}
|
||||
|
||||
; Check that WWM is turned on correctly across basic block boundaries.
|
||||
; loop version
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_strict_wwm6_loop:
|
||||
;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
|
||||
;SI-CHECK: buffer_load_dword
|
||||
;VI-CHECK: flat_load_dword
|
||||
;CHECK: s_mov_b64 exec, [[ORIG]]
|
||||
;CHECK: %loop
|
||||
;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
|
||||
;SI-CHECK: buffer_load_dword
|
||||
;VI-CHECK: flat_load_dword
|
||||
;CHECK: s_mov_b64 exec, [[ORIG2]]
|
||||
define amdgpu_ps float @test_strict_wwm6_loop() {
|
||||
main_body:
|
||||
%src0 = load volatile float, float addrspace(1)* undef
|
||||
; use mbcnt to make sure the branch is divergent
|
||||
%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
|
||||
%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
|
||||
br label %loop
|
||||
|
||||
loop:
|
||||
%counter = phi i32 [ %lo, %main_body ], [ %counter.1, %loop ]
|
||||
%src1 = load volatile float, float addrspace(1)* undef
|
||||
%out = fadd float %src0, %src1
|
||||
%out.0 = call float @llvm.amdgcn.strict.wwm.f32(float %out)
|
||||
%counter.1 = sub i32 %counter, 1
|
||||
%cc = icmp ne i32 %counter.1, 0
|
||||
br i1 %cc, label %loop, label %endloop
|
||||
|
||||
endloop:
|
||||
ret float %out.0
|
||||
}
|
||||
|
||||
; Check that @llvm.amdgcn.set.inactive disables WWM.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_strict_wwm_set_inactive1:
|
||||
;CHECK: buffer_load_dword
|
||||
;CHECK: s_not_b64 exec, exec
|
||||
;CHECK: v_mov_b32_e32
|
||||
;CHECK: s_not_b64 exec, exec
|
||||
;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
|
||||
;CHECK: v_add_{{[iu]}}32_e32
|
||||
define amdgpu_ps void @test_strict_wwm_set_inactive1(i32 inreg %idx) {
|
||||
main_body:
|
||||
%src = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
|
||||
%src.0 = bitcast float %src to i32
|
||||
%src.1 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %src.0, i32 0)
|
||||
%out = add i32 %src.1, %src.1
|
||||
%out.0 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %out)
|
||||
%out.1 = bitcast i32 %out.0 to float
|
||||
call void @llvm.amdgcn.struct.buffer.store.f32(float %out.1, <4 x i32> undef, i32 %idx, i32 0, i32 0, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Check a case of a block being entirely WQM except for a bit of WWM.
|
||||
; There was a bug where it forgot to enter and leave WWM.
|
||||
;
|
||||
;CHECK-LABEL: {{^}}test_strict_wwm_within_wqm:
|
||||
;CHECK: %IF
|
||||
;CHECK: s_or_saveexec_b64 {{.*}}, -1
|
||||
;CHECK: ds_swizzle
|
||||
;
|
||||
define amdgpu_ps float @test_strict_wwm_within_wqm(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %c, i32 %z, float %data) {
|
||||
main_body:
|
||||
%c.bc = bitcast i32 %c to float
|
||||
%tex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
|
||||
%tex0 = extractelement <4 x float> %tex, i32 0
|
||||
%dtex = call <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32 15, float %tex0, <8 x i32> %rsrc, <4 x i32> %sampler, i1 false, i32 0, i32 0) #0
|
||||
%cmp = icmp eq i32 %z, 0
|
||||
br i1 %cmp, label %IF, label %ENDIF
|
||||
|
||||
IF:
|
||||
%dataf = extractelement <4 x float> %dtex, i32 0
|
||||
%data1 = fptosi float %dataf to i32
|
||||
%data2 = call i32 @llvm.amdgcn.set.inactive.i32(i32 %data1, i32 0)
|
||||
%data3 = call i32 @llvm.amdgcn.ds.swizzle(i32 %data2, i32 2079)
|
||||
%data4 = call i32 @llvm.amdgcn.strict.wwm.i32(i32 %data3)
|
||||
%data4f = sitofp i32 %data4 to float
|
||||
br label %ENDIF
|
||||
|
||||
ENDIF:
|
||||
%r = phi float [ %data4f, %IF ], [ 0.0, %main_body ]
|
||||
ret float %r
|
||||
}
|
||||
|
||||
|
||||
|
||||
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
|
||||
declare void @llvm.amdgcn.image.store.1d.v4f32.i32(<4 x float>, i32, i32, <8 x i32>, i32, i32) #1
|
||||
|
||||
@ -838,6 +1077,8 @@ declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8
|
||||
declare void @llvm.amdgcn.kill(i1) #1
|
||||
declare float @llvm.amdgcn.wqm.f32(float) #3
|
||||
declare i32 @llvm.amdgcn.wqm.i32(i32) #3
|
||||
declare float @llvm.amdgcn.strict.wwm.f32(float) #3
|
||||
declare i32 @llvm.amdgcn.strict.wwm.i32(i32) #3
|
||||
declare float @llvm.amdgcn.wwm.f32(float) #3
|
||||
declare i32 @llvm.amdgcn.wwm.i32(i32) #3
|
||||
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32) #4
|
||||
|
@ -3,10 +3,10 @@
|
||||
---
|
||||
# Check for awareness that s_or_saveexec_b64 clobbers SCC
|
||||
#
|
||||
#CHECK: ENTER_WWM
|
||||
#CHECK: ENTER_STRICT_WWM
|
||||
#CHECK: S_CMP_LT_I32
|
||||
#CHECK: S_CSELECT_B32
|
||||
name: test_wwm_scc
|
||||
name: test_strict_wwm_scc
|
||||
alignment: 1
|
||||
exposesReturnsTwice: false
|
||||
legalized: false
|
||||
@ -44,7 +44,7 @@ body: |
|
||||
%12 = V_ADD_CO_U32_e32 %3, %3, implicit-def $vcc, implicit $exec
|
||||
%5 = S_CSELECT_B32 %2, %1, implicit $scc
|
||||
%11 = V_ADD_CO_U32_e32 %5, %12, implicit-def $vcc, implicit $exec
|
||||
$vgpr0 = WWM %11, implicit $exec
|
||||
$vgpr0 = STRICT_WWM %11, implicit $exec
|
||||
SI_RETURN_TO_EPILOG $vgpr0
|
||||
|
||||
...
|
||||
@ -56,10 +56,10 @@ body: |
|
||||
#CHECK: %bb.1
|
||||
#CHECK: S_CMP_LT_I32
|
||||
#CHECK: COPY $scc
|
||||
#CHECK: ENTER_WWM
|
||||
#CHECK: ENTER_STRICT_WWM
|
||||
#CHECK: $scc = COPY
|
||||
#CHECK: S_CSELECT_B32
|
||||
name: test_wwm_scc2
|
||||
name: test_strict_wwm_scc2
|
||||
tracksRegLiveness: true
|
||||
body: |
|
||||
bb.0:
|
||||
@ -77,7 +77,7 @@ body: |
|
||||
%12:vgpr_32 = V_ADD_CO_U32_e32 %3:vgpr_32, %3:vgpr_32, implicit-def $vcc, implicit $exec
|
||||
%5:sgpr_32 = S_CSELECT_B32 %2:sgpr_32, %1:sgpr_32, implicit $scc
|
||||
%11:vgpr_32 = V_ADD_CO_U32_e32 %5:sgpr_32, %12:vgpr_32, implicit-def $vcc, implicit $exec
|
||||
$vgpr0 = WWM %11:vgpr_32, implicit $exec
|
||||
$vgpr0 = STRICT_WWM %11:vgpr_32, implicit $exec
|
||||
$vgpr1 = COPY %10:vgpr_32
|
||||
SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
|
||||
|
||||
@ -136,19 +136,19 @@ body: |
|
||||
%10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec, implicit-def $scc
|
||||
%14:vgpr_32 = COPY %7
|
||||
%13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec
|
||||
early-clobber %15:vgpr_32 = WWM killed %13, implicit $exec
|
||||
early-clobber %15:vgpr_32 = STRICT_WWM killed %13, implicit $exec
|
||||
BUFFER_STORE_DWORD_OFFSET_exact killed %15, %6, %7, 4, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
S_ENDPGM 0
|
||||
|
||||
...
|
||||
|
||||
---
|
||||
# Ensure that wwm is not put around an EXEC copy
|
||||
# Ensure that strict_wwm is not put around an EXEC copy
|
||||
#CHECK-LABEL: name: copy_exec
|
||||
#CHECK: %7:sreg_64 = COPY $exec
|
||||
#CHECK-NEXT: %14:sreg_64 = ENTER_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
#CHECK-NEXT: %14:sreg_64 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
|
||||
#CHECK-NEXT: %8:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
|
||||
#CHECK-NEXT: $exec = EXIT_WWM %14
|
||||
#CHECK-NEXT: $exec = EXIT_STRICT_WWM %14
|
||||
#CHECK-NEXT: %9:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %7.sub0, 0, implicit $exec
|
||||
name: copy_exec
|
||||
tracksRegLiveness: true
|
||||
@ -169,7 +169,7 @@ body: |
|
||||
%10:vgpr_32 = V_MBCNT_LO_U32_B32_e64 %8.sub0:sreg_64, 0, implicit $exec
|
||||
%11:vgpr_32 = V_MOV_B32_dpp %9:vgpr_32, %10:vgpr_32, 312, 15, 15, 0, implicit $exec
|
||||
%12:sreg_32 = V_READLANE_B32 %11:vgpr_32, 63
|
||||
early-clobber %13:sreg_32 = WWM %9:vgpr_32, implicit $exec
|
||||
early-clobber %13:sreg_32 = STRICT_WWM %9:vgpr_32, implicit $exec
|
||||
|
||||
%14:vgpr_32 = COPY %13
|
||||
BUFFER_STORE_DWORD_OFFSET_exact killed %14, %4, %5, 4, 0, 0, 0, 0, 0, 0, implicit $exec
|
||||
@ -237,12 +237,12 @@ body: |
|
||||
---
|
||||
# Check that unnecessary instructions do not get marked for WWM
|
||||
#
|
||||
#CHECK-NOT: ENTER_WWM
|
||||
#CHECK-NOT: ENTER_STRICT_WWM
|
||||
#CHECK: BUFFER_LOAD_DWORDX2
|
||||
#CHECK-NOT: ENTER_WWM
|
||||
#CHECK-NOT: ENTER_STRICT_WWM
|
||||
#CHECK: V_SET_INACTIVE_B32
|
||||
#CHECK: V_SET_INACTIVE_B32
|
||||
#CHECK: ENTER_WWM
|
||||
#CHECK: ENTER_STRICT_WWM
|
||||
#CHECK: V_MAX
|
||||
name: test_wwm_set_inactive_propagation
|
||||
tracksRegLiveness: true
|
||||
@ -255,7 +255,7 @@ body: |
|
||||
%2.sub0:vreg_64 = V_SET_INACTIVE_B32 %2.sub0:vreg_64, 0, implicit $exec, implicit-def $scc
|
||||
%2.sub1:vreg_64 = V_SET_INACTIVE_B32 %2.sub1:vreg_64, 0, implicit $exec, implicit-def $scc
|
||||
%3:vreg_64 = nnan nsz arcp contract reassoc nofpexcept V_MAX_F64_e64 0, %2:vreg_64, 0, %2:vreg_64, 0, 0, implicit $mode, implicit $exec
|
||||
$vgpr0 = WWM %3.sub0:vreg_64, implicit $exec
|
||||
$vgpr1 = WWM %3.sub1:vreg_64, implicit $exec
|
||||
$vgpr0 = STRICT_WWM %3.sub0:vreg_64, implicit $exec
|
||||
$vgpr1 = STRICT_WWM %3.sub1:vreg_64, implicit $exec
|
||||
SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
|
||||
...
|
||||
|
@ -1,6 +1,8 @@
|
||||
; RUN: llc -O0 -march=amdgcn -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-O0 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-dpp-combine=false -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-O3 %s
|
||||
|
||||
; NOTE: llvm.amdgcn.wwm is deprecated, use llvm.amdgcn.strict.wwm instead.
|
||||
|
||||
; GFX9-LABEL: {{^}}no_cfg:
|
||||
define amdgpu_cs void @no_cfg(<4 x i32> inreg %tmp14) {
|
||||
%tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0)
|
||||
@ -187,6 +189,195 @@ define amdgpu_cs void @_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; GFX9-LABEL: {{^}}strict_wwm_no_cfg:
|
||||
define amdgpu_cs void @strict_wwm_no_cfg(<4 x i32> inreg %tmp14) {
|
||||
%tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0)
|
||||
%tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
|
||||
%tmp102 = extractelement <2 x i32> %tmp101, i32 0
|
||||
%tmp103 = extractelement <2 x i32> %tmp101, i32 1
|
||||
%tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
|
||||
%tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
|
||||
|
||||
; GFX9: s_or_saveexec_b64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, -1
|
||||
|
||||
; GFX9-DAG: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
|
||||
; GFX9-DAG: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]]
|
||||
; GFX9-DAG: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]]
|
||||
%tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false)
|
||||
%tmp121 = add i32 %tmp105, %tmp120
|
||||
%tmp122 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp121)
|
||||
|
||||
; GFX9-DAG: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
|
||||
; GFX9-DAG: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]]
|
||||
; GFX9-DAG: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]]
|
||||
%tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false)
|
||||
%tmp136 = add i32 %tmp107, %tmp135
|
||||
%tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)
|
||||
|
||||
; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]]
|
||||
; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]]
|
||||
%tmp138 = icmp eq i32 %tmp122, %tmp137
|
||||
%tmp139 = sext i1 %tmp138 to i32
|
||||
%tmp140 = shl nsw i32 %tmp139, 1
|
||||
%tmp141 = and i32 %tmp140, 2
|
||||
%tmp145 = bitcast i32 %tmp141 to float
|
||||
call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GFX9-LABEL: {{^}}strict_wwm_cfg:
|
||||
define amdgpu_cs void @strict_wwm_cfg(<4 x i32> inreg %tmp14, i32 %arg) {
|
||||
entry:
|
||||
%tmp100 = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %tmp14, i32 0, i32 0, i32 0)
|
||||
%tmp101 = bitcast <2 x float> %tmp100 to <2 x i32>
|
||||
%tmp102 = extractelement <2 x i32> %tmp101, i32 0
|
||||
%tmp105 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp102, i32 0)
|
||||
|
||||
; GFX9: v_mov_b32_dpp v[[FIRST_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
|
||||
; GFX9: v_add_u32_e32 v[[FIRST_ADD:[0-9]+]], v{{[0-9]+}}, v[[FIRST_MOV]]
|
||||
; GFX9: v_mov_b32_e32 v[[FIRST:[0-9]+]], v[[FIRST_ADD]]
|
||||
; GFX9-O0: buffer_store_dword v[[FIRST]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET:[0-9]+]]
|
||||
%tmp120 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp105, i32 323, i32 12, i32 15, i1 false)
|
||||
%tmp121 = add i32 %tmp105, %tmp120
|
||||
%tmp122 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp121)
|
||||
|
||||
%cond = icmp eq i32 %arg, 0
|
||||
br i1 %cond, label %if, label %merge
|
||||
if:
|
||||
%tmp103 = extractelement <2 x i32> %tmp101, i32 1
|
||||
%tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %tmp103, i32 0)
|
||||
|
||||
; GFX9: v_mov_b32_dpp v[[SECOND_MOV:[0-9]+]], v{{[0-9]+}} row_bcast:31 row_mask:0xc bank_mask:0xf
|
||||
; GFX9: v_add_u32_e32 v[[SECOND_ADD:[0-9]+]], v{{[0-9]+}}, v[[SECOND_MOV]]
|
||||
; GFX9: v_mov_b32_e32 v[[SECOND:[0-9]+]], v[[SECOND_ADD]]
|
||||
; GFX9-O0: buffer_store_dword v[[SECOND]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET:[0-9]+]]
|
||||
%tmp135 = tail call i32 @llvm.amdgcn.update.dpp.i32(i32 0, i32 %tmp107, i32 323, i32 12, i32 15, i1 false)
|
||||
%tmp136 = add i32 %tmp107, %tmp135
|
||||
%tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)
|
||||
br label %merge
|
||||
|
||||
merge:
|
||||
%merge_value = phi i32 [ 0, %entry ], [%tmp137, %if ]
|
||||
; GFX9-O3: v_cmp_eq_u32_e32 vcc, v[[FIRST]], v[[SECOND]]
|
||||
; GFX9-O0: buffer_load_dword v[[FIRST:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[FIRST_IMM_OFFSET]]
|
||||
; GFX9-O0: buffer_load_dword v[[SECOND:[0-9]+]], off, s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, 0 offset:[[SECOND_IMM_OFFSET]]
|
||||
; GFX9-O0: v_cmp_eq_u32_e64 s{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}}, v[[FIRST]], v[[SECOND]]
|
||||
%tmp138 = icmp eq i32 %tmp122, %merge_value
|
||||
%tmp139 = sext i1 %tmp138 to i32
|
||||
%tmp140 = shl nsw i32 %tmp139, 1
|
||||
%tmp141 = and i32 %tmp140, 2
|
||||
%tmp145 = bitcast i32 %tmp141 to float
|
||||
call void @llvm.amdgcn.raw.buffer.store.f32(float %tmp145, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GFX9-LABEL: {{^}}strict_wwm_called:
|
||||
define hidden i32 @strict_wwm_called(i32 %a) noinline {
|
||||
; GFX9: v_add_u32_e32 v1, v0, v0
|
||||
%add = add i32 %a, %a
|
||||
; GFX9: v_mul_lo_u32 v0, v1, v0
|
||||
%mul = mul i32 %add, %a
|
||||
; GFX9: v_sub_u32_e32 v0, v0, v1
|
||||
%sub = sub i32 %mul, %add
|
||||
ret i32 %sub
|
||||
}
|
||||
|
||||
; GFX9-LABEL: {{^}}strict_wwm_call:
|
||||
define amdgpu_kernel void @strict_wwm_call(<4 x i32> inreg %tmp14, i32 inreg %arg) {
|
||||
; GFX9-DAG: s_load_dword [[ARG:s[0-9]+]]
|
||||
; GFX9-O0-DAG: s_mov_b32 s0, 0{{$}}
|
||||
; GFX9-O0-DAG: v_mov_b32_e32 v0, [[ARG]]
|
||||
; GFX9-O0-DAG: v_mov_b32_e32 v2, v0
|
||||
|
||||
; GFX9-O3: v_mov_b32_e32 v2, [[ARG]]
|
||||
|
||||
; GFX9-NEXT: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0
|
||||
; GFX9-NEXT: s_not_b64 exec, exec
|
||||
%tmp107 = tail call i32 @llvm.amdgcn.set.inactive.i32(i32 %arg, i32 0)
|
||||
; GFX9: v_mov_b32_e32 v0, v2
|
||||
; GFX9: s_swappc_b64
|
||||
%tmp134 = call i32 @strict_wwm_called(i32 %tmp107)
|
||||
; GFX9: v_mov_b32_e32 v1, v0
|
||||
; GFX9: v_add_u32_e32 v1, v1, v2
|
||||
%tmp136 = add i32 %tmp134, %tmp107
|
||||
%tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136)
|
||||
; GFX9: buffer_store_dword v0
|
||||
call void @llvm.amdgcn.raw.buffer.store.i32(i32 %tmp137, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GFX9-LABEL: {{^}}strict_wwm_called_i64:
|
||||
define i64 @strict_wwm_called_i64(i64 %a) noinline {
|
||||
%add = add i64 %a, %a
|
||||
%mul = mul i64 %add, %a
|
||||
%sub = sub i64 %mul, %add
|
||||
ret i64 %sub
|
||||
}
|
||||
|
||||
; GFX9-LABEL: {{^}}strict_wwm_call_i64:
|
||||
define amdgpu_kernel void @strict_wwm_call_i64(<4 x i32> inreg %tmp14, i64 inreg %arg) {
|
||||
; GFX9: s_load_dwordx2 s{{\[}}[[ARG_LO:[0-9]+]]:[[ARG_HI:[0-9]+]]{{\]}}
|
||||
|
||||
; GFX9-O0: s_mov_b64 s{{\[}}[[ZERO_LO:[0-9]+]]:[[ZERO_HI:[0-9]+]]{{\]}}, 0{{$}}
|
||||
; GFX9-O0: v_mov_b32_e32 v0, s[[ARG_LO]]
|
||||
; GFX9-O0: v_mov_b32_e32 v1, s[[ARG_HI]]
|
||||
; GFX9-O0-DAG: v_mov_b32_e32 v10, v1
|
||||
; GFX9-O0-DAG: v_mov_b32_e32 v9, v0
|
||||
|
||||
; GFX9-O3-DAG: v_mov_b32_e32 v7, s[[ARG_HI]]
|
||||
; GFX9-O3-DAG: v_mov_b32_e32 v6, s[[ARG_LO]]
|
||||
|
||||
; GFX9: s_not_b64 exec, exec
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v9, s[[ZERO_LO]]
|
||||
; GFX9-O0-NEXT: v_mov_b32_e32 v10, s[[ZERO_HI]]
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0
|
||||
; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0
|
||||
; GFX9-NEXT: s_not_b64 exec, exec
|
||||
%tmp107 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %arg, i64 0)
|
||||
; GFX9: s_swappc_b64
|
||||
%tmp134 = call i64 @strict_wwm_called_i64(i64 %tmp107)
|
||||
%tmp136 = add i64 %tmp134, %tmp107
|
||||
%tmp137 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp136)
|
||||
%tmp138 = bitcast i64 %tmp137 to <2 x i32>
|
||||
; GFX9: buffer_store_dwordx2
|
||||
call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %tmp138, <4 x i32> %tmp14, i32 4, i32 0, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GFX9-LABEL: {{^}}strict_wwm_amdgpu_cs_main:
|
||||
define amdgpu_cs void @strict_wwm_amdgpu_cs_main(<4 x i32> inreg %desc, i32 %index) {
|
||||
%tmp17 = shl i32 %index, 5
|
||||
; GFX9: buffer_load_dwordx4
|
||||
%tmp18 = tail call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %desc, i32 %tmp17, i32 0)
|
||||
%.i0.upto1.bc = bitcast <4 x i32> %tmp18 to <2 x i64>
|
||||
%tmp19 = or i32 %tmp17, 16
|
||||
; GFX9: buffer_load_dwordx2
|
||||
%tmp20 = tail call <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32> %desc, i32 %tmp19, i32 0)
|
||||
%.i0.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 0
|
||||
%tmp22 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i0.upto1.extract, i64 9223372036854775807)
|
||||
%tmp97 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp22)
|
||||
%.i1.upto1.extract = extractelement <2 x i64> %.i0.upto1.bc, i32 1
|
||||
%tmp99 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i1.upto1.extract, i64 9223372036854775807)
|
||||
%tmp174 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp99)
|
||||
%.i25 = bitcast <2 x i32> %tmp20 to i64
|
||||
%tmp176 = tail call i64 @llvm.amdgcn.set.inactive.i64(i64 %.i25, i64 9223372036854775807)
|
||||
%tmp251 = tail call i64 @llvm.amdgcn.strict.wwm.i64(i64 %tmp176)
|
||||
%.cast = bitcast i64 %tmp97 to <2 x float>
|
||||
%.cast6 = bitcast i64 %tmp174 to <2 x float>
|
||||
%.cast7 = bitcast i64 %tmp251 to <2 x float>
|
||||
%tmp254 = shufflevector <2 x float> %.cast, <2 x float> %.cast6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
|
||||
; GFX9: buffer_store_dwordx4
|
||||
tail call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %tmp254, <4 x i32> %desc, i32 %tmp17, i32 0, i32 0)
|
||||
; GFX9: buffer_store_dwordx2
|
||||
tail call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %.cast7, <4 x i32> %desc, i32 %tmp19, i32 0, i32 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.strict.wwm.i32(i32)
|
||||
declare i64 @llvm.amdgcn.strict.wwm.i64(i64)
|
||||
declare i32 @llvm.amdgcn.wwm.i32(i32)
|
||||
declare i64 @llvm.amdgcn.wwm.i64(i64)
|
||||
declare i32 @llvm.amdgcn.set.inactive.i32(i32, i32)
|
||||
|
@ -462,9 +462,9 @@ TEST(LiveIntervalTest, EarlyClobberSubRegMoveUp) {
|
||||
liveIntervalTest(R"MIR(
|
||||
%4:sreg_32 = IMPLICIT_DEF
|
||||
%6:sreg_32 = IMPLICIT_DEF
|
||||
undef early-clobber %9.sub0:sreg_64 = WWM %4:sreg_32, implicit $exec
|
||||
undef early-clobber %9.sub0:sreg_64 = STRICT_WWM %4:sreg_32, implicit $exec
|
||||
%5:sreg_32 = S_FLBIT_I32_B32 %9.sub0:sreg_64
|
||||
early-clobber %9.sub1:sreg_64 = WWM %6:sreg_32, implicit $exec
|
||||
early-clobber %9.sub1:sreg_64 = STRICT_WWM %6:sreg_32, implicit $exec
|
||||
%7:sreg_32 = S_FLBIT_I32_B32 %9.sub1:sreg_64
|
||||
)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
|
||||
testHandleMove(MF, LIS, 4, 3);
|
||||
|
Loading…
Reference in New Issue
Block a user