mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
[AMDGPU] SDWA: add support for GFX9 in peephole pass
Summary: Added support based on the merged SDWA pseudo instructions. The peephole pass now allows one scalar operand as well as the omod and clamp modifiers. Added several subtarget features for GFX9 SDWA. This diff also contains changes from D34026. Depends on D34026. Reviewers: vpykhtin, rampitec, arsenm. Subscribers: kzhuravl, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye. Differential Revision: https://reviews.llvm.org/D34241 llvm-svn: 305986
This commit is contained in:
parent
7fc4473e22
commit
076a1edc25
@ -238,6 +238,36 @@ def FeatureSDWA : SubtargetFeature<"sdwa",
|
||||
"Support SDWA (Sub-DWORD Addressing) extension"
|
||||
>;
|
||||
|
||||
def FeatureSDWAOmod : SubtargetFeature<"sdwa-omod",
|
||||
"HasSDWAOmod",
|
||||
"true",
|
||||
"Support OMod with SDWA (Sub-DWORD Addressing) extension"
|
||||
>;
|
||||
|
||||
def FeatureSDWAScalar : SubtargetFeature<"sdwa-scalar",
|
||||
"HasSDWAScalar",
|
||||
"true",
|
||||
"Support scalar register with SDWA (Sub-DWORD Addressing) extension"
|
||||
>;
|
||||
|
||||
def FeatureSDWASdst : SubtargetFeature<"sdwa-sdst",
|
||||
"HasSDWASdst",
|
||||
"true",
|
||||
"Support scalar dst for VOPC with SDWA (Sub-DWORD Addressing) extension"
|
||||
>;
|
||||
|
||||
def FeatureSDWAMac : SubtargetFeature<"sdwa-mav",
|
||||
"HasSDWAMac",
|
||||
"true",
|
||||
"Support v_mac_f32/f16 with SDWA (Sub-DWORD Addressing) extension"
|
||||
>;
|
||||
|
||||
def FeatureSDWAClampVOPC : SubtargetFeature<"sdwa-clamp-vopc",
|
||||
"HasSDWAClampVOPC",
|
||||
"true",
|
||||
"Support clamp for VOPC with SDWA (Sub-DWORD Addressing) extension"
|
||||
>;
|
||||
|
||||
def FeatureDPP : SubtargetFeature<"dpp",
|
||||
"HasDPP",
|
||||
"true",
|
||||
@ -421,8 +451,8 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
|
||||
FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
|
||||
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
|
||||
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
|
||||
FeatureScalarStores, FeatureInv2PiInlineImm, FeatureSDWA,
|
||||
FeatureDPP
|
||||
FeatureScalarStores, FeatureInv2PiInlineImm,
|
||||
FeatureSDWA, FeatureSDWAClampVOPC, FeatureSDWAMac, FeatureDPP
|
||||
]
|
||||
>;
|
||||
|
||||
@ -432,7 +462,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
|
||||
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
|
||||
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
|
||||
FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
|
||||
FeatureFastFMAF32, FeatureSDWA, FeatureDPP,
|
||||
FeatureFastFMAF32, FeatureDPP,
|
||||
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
|
||||
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts
|
||||
]
|
||||
>;
|
||||
|
@ -124,6 +124,11 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
|
||||
HasScalarStores(false),
|
||||
HasInv2PiInlineImm(false),
|
||||
HasSDWA(false),
|
||||
HasSDWAOmod(false),
|
||||
HasSDWAScalar(false),
|
||||
HasSDWASdst(false),
|
||||
HasSDWAMac(false),
|
||||
HasSDWAClampVOPC(false),
|
||||
HasDPP(false),
|
||||
FlatAddressSpace(false),
|
||||
FlatInstOffsets(false),
|
||||
|
@ -149,6 +149,11 @@ protected:
|
||||
bool HasScalarStores;
|
||||
bool HasInv2PiInlineImm;
|
||||
bool HasSDWA;
|
||||
bool HasSDWAOmod;
|
||||
bool HasSDWAScalar;
|
||||
bool HasSDWASdst;
|
||||
bool HasSDWAMac;
|
||||
bool HasSDWAClampVOPC;
|
||||
bool HasDPP;
|
||||
bool FlatAddressSpace;
|
||||
bool FlatInstOffsets;
|
||||
@ -431,6 +436,26 @@ public:
|
||||
return HasSDWA;
|
||||
}
|
||||
|
||||
bool hasSDWAOmod() const {
|
||||
return HasSDWAOmod;
|
||||
}
|
||||
|
||||
bool hasSDWAScalar() const {
|
||||
return HasSDWAScalar;
|
||||
}
|
||||
|
||||
bool hasSDWASdst() const {
|
||||
return HasSDWASdst;
|
||||
}
|
||||
|
||||
bool hasSDWAMac() const {
|
||||
return HasSDWAMac;
|
||||
}
|
||||
|
||||
bool hasSDWAClampVOPC() const {
|
||||
return HasSDWAClampVOPC;
|
||||
}
|
||||
|
||||
/// \brief Returns the offset in bytes from the start of the input buffer
|
||||
/// of the first explicit kernel argument.
|
||||
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
|
||||
|
@ -2454,7 +2454,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
|
||||
continue;
|
||||
const MachineOperand &MO = MI.getOperand(OpIdx);
|
||||
|
||||
if (AMDGPU::isVI(ST)) {
|
||||
if (!ST.hasSDWAScalar()) {
|
||||
// Only VGPRS on VI
|
||||
if (!MO.isReg() || !RI.hasVGPRs(RI.getRegClassForReg(MRI, MO.getReg()))) {
|
||||
ErrInfo = "Only VGPRs allowed as operands in SDWA instructions on VI";
|
||||
@ -2469,7 +2469,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
|
||||
}
|
||||
}
|
||||
|
||||
if (AMDGPU::isVI(ST)) {
|
||||
if (!ST.hasSDWAOmod()) {
|
||||
// No omod allowed on VI
|
||||
const MachineOperand *OMod = getNamedOperand(MI, AMDGPU::OpName::omod);
|
||||
if (OMod != nullptr &&
|
||||
@ -2481,14 +2481,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
|
||||
|
||||
uint16_t BasicOpcode = AMDGPU::getBasicFromSDWAOp(Opcode);
|
||||
if (isVOPC(BasicOpcode)) {
|
||||
if (AMDGPU::isVI(ST) && DstIdx != -1) {
|
||||
if (!ST.hasSDWASdst() && DstIdx != -1) {
|
||||
// Only vcc allowed as dst on VI for VOPC
|
||||
const MachineOperand &Dst = MI.getOperand(DstIdx);
|
||||
if (!Dst.isReg() || Dst.getReg() != AMDGPU::VCC) {
|
||||
ErrInfo = "Only VCC allowed as dst in SDWA instructions on VI";
|
||||
return false;
|
||||
}
|
||||
} else if (AMDGPU::isGFX9(ST)) {
|
||||
} else if (!ST.hasSDWAClampVOPC()) {
|
||||
// No clamp allowed on GFX9 for VOPC
|
||||
const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
|
||||
if (Clamp != nullptr &&
|
||||
|
@ -67,9 +67,9 @@ public:
|
||||
|
||||
bool runOnMachineFunction(MachineFunction &MF) override;
|
||||
void matchSDWAOperands(MachineFunction &MF);
|
||||
bool isConvertibleToSDWA(const MachineInstr &MI) const;
|
||||
bool isConvertibleToSDWA(const MachineInstr &MI, const SISubtarget &ST) const;
|
||||
bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
|
||||
void legalizeScalarOperands(MachineInstr &MI) const;
|
||||
void legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const;
|
||||
|
||||
StringRef getPassName() const override { return "SI Peephole SDWA"; }
|
||||
|
||||
@ -607,24 +607,38 @@ void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
|
||||
}
|
||||
}
|
||||
|
||||
bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI) const {
|
||||
bool SIPeepholeSDWA::isConvertibleToSDWA(const MachineInstr &MI,
|
||||
const SISubtarget &ST) const {
|
||||
// Check if this instruction has opcode that supports SDWA
|
||||
unsigned Opc = MI.getOpcode();
|
||||
if (AMDGPU::getSDWAOp(Opc) != -1)
|
||||
return true;
|
||||
int Opc32 = AMDGPU::getVOPe32(Opc);
|
||||
if (Opc32 != -1 && AMDGPU::getSDWAOp(Opc32) != -1) {
|
||||
if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
|
||||
int Opc = MI.getOpcode();
|
||||
if (AMDGPU::getSDWAOp(Opc) == -1)
|
||||
Opc = AMDGPU::getVOPe32(Opc);
|
||||
|
||||
if (Opc == -1 || AMDGPU::getSDWAOp(Opc) == -1)
|
||||
return false;
|
||||
|
||||
if (!ST.hasSDWAOmod() && TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
|
||||
return false;
|
||||
|
||||
if (TII->isVOPC(Opc)) {
|
||||
if (!ST.hasSDWASdst()) {
|
||||
const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
|
||||
if (SDst && SDst->getReg() != AMDGPU::VCC)
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ST.hasSDWAClampVOPC() && TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
|
||||
return false;
|
||||
|
||||
if (TII->isVOPC(Opc)) {
|
||||
const MachineOperand *SDst = TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
|
||||
return SDst && SDst->getReg() == AMDGPU::VCC;
|
||||
} else {
|
||||
return !TII->getNamedOperand(MI, AMDGPU::OpName::sdst);
|
||||
}
|
||||
} else if (TII->getNamedOperand(MI, AMDGPU::OpName::sdst)) {
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
|
||||
if (!ST.hasSDWAMac() && (Opc == AMDGPU::V_MAC_F16_e32 ||
|
||||
Opc == AMDGPU::V_MAC_F32_e32))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
|
||||
@ -690,13 +704,23 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
|
||||
SDWAInst.add(*Src2);
|
||||
}
|
||||
|
||||
// Initialize clamp.
|
||||
if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1)
|
||||
// Copy clamp if present, initialize otherwise
|
||||
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
|
||||
MachineOperand *Clamp = TII->getNamedOperand(MI, AMDGPU::OpName::clamp);
|
||||
if (Clamp) {
|
||||
SDWAInst.add(*Clamp);
|
||||
} else {
|
||||
SDWAInst.addImm(0);
|
||||
}
|
||||
|
||||
// Initialize omod.
|
||||
if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1)
|
||||
// Copy omod if present, initialize otherwise if needed
|
||||
MachineOperand *OMod = TII->getNamedOperand(MI, AMDGPU::OpName::omod);
|
||||
if (OMod) {
|
||||
assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1);
|
||||
SDWAInst.add(*OMod);
|
||||
} else if (AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::omod) != -1) {
|
||||
SDWAInst.addImm(0);
|
||||
}
|
||||
|
||||
// Initialize dst_sel and dst_unused if present
|
||||
if (Dst) {
|
||||
@ -750,16 +774,25 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
|
||||
}
|
||||
|
||||
// If an instruction was converted to SDWA it should not have immediates or SGPR
|
||||
// operands. Copy its scalar operands into VGPRs.
|
||||
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const {
|
||||
// operands (one SGPR is allowed on GFX9). Copy its scalar operands into VGPRs.
|
||||
void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI, const SISubtarget &ST) const {
|
||||
const MCInstrDesc &Desc = TII->get(MI.getOpcode());
|
||||
for (unsigned I = 0, E = MI.getNumExplicitOperands(); I != E; ++I) {
|
||||
MachineOperand &Op = MI.getOperand(I);
|
||||
unsigned ConstantBusCount = 0;
|
||||
for (MachineOperand &Op: MI.explicit_uses()) {
|
||||
if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
|
||||
continue;
|
||||
|
||||
unsigned I = MI.getOperandNo(&Op);
|
||||
if (Desc.OpInfo[I].RegClass == -1 ||
|
||||
!TRI->hasVGPRs(TRI->getRegClass(Desc.OpInfo[I].RegClass)))
|
||||
continue;
|
||||
|
||||
if (ST.hasSDWAScalar() && ConstantBusCount == 0 && Op.isReg() &&
|
||||
TRI->isSGPRReg(*MRI, Op.getReg())) {
|
||||
++ConstantBusCount;
|
||||
continue;
|
||||
}
|
||||
|
||||
unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
||||
auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
|
||||
TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
|
||||
@ -775,10 +808,8 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI) const {
|
||||
bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
|
||||
const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
|
||||
|
||||
if (!ST.hasSDWA() ||
|
||||
!AMDGPU::isVI(ST)) { // TODO: Add support for SDWA on gfx9
|
||||
if (!ST.hasSDWA())
|
||||
return false;
|
||||
}
|
||||
|
||||
MRI = &MF.getRegInfo();
|
||||
TRI = ST.getRegisterInfo();
|
||||
@ -790,7 +821,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
|
||||
for (const auto &OperandPair : SDWAOperands) {
|
||||
const auto &Operand = OperandPair.second;
|
||||
MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
|
||||
if (PotentialMI && isConvertibleToSDWA(*PotentialMI)) {
|
||||
if (PotentialMI && isConvertibleToSDWA(*PotentialMI, ST)) {
|
||||
PotentialMatches[PotentialMI].push_back(Operand.get());
|
||||
}
|
||||
}
|
||||
@ -805,7 +836,7 @@ bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
|
||||
|
||||
bool Ret = !ConvertedInstructions.empty();
|
||||
while (!ConvertedInstructions.empty())
|
||||
legalizeScalarOperands(*ConvertedInstructions.pop_back_val());
|
||||
legalizeScalarOperands(*ConvertedInstructions.pop_back_val(), ST);
|
||||
|
||||
return Ret;
|
||||
}
|
||||
|
@ -401,10 +401,6 @@ class VOP_SDWA_Real <VOP_SDWA_Pseudo ps> :
|
||||
let Constraints = ps.Constraints;
|
||||
let DisableEncoding = ps.DisableEncoding;
|
||||
|
||||
// string Mnemonic = ps.Mnemonic;
|
||||
// string AsmOperands = ps.AsmOperands;
|
||||
// string AsmOperands9 = ps.AsmOperands9;
|
||||
|
||||
// Copy relevant pseudo op flags
|
||||
let SubtargetPredicate = ps.SubtargetPredicate;
|
||||
let AssemblerPredicate = ps.AssemblerPredicate;
|
||||
|
@ -134,11 +134,10 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x
|
||||
; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}
|
||||
; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
|
||||
|
||||
; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
|
||||
; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
|
||||
; GFX9-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
|
||||
; VI-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
|
||||
; VI-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
; GFX89-DAG: v_mov_b32_e32 [[CONST2:v[0-9]+]], 0x4000
|
||||
; GFX89-DAG: v_sub_f16_sdwa v{{[0-9]+}}, [[CONST2]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
|
||||
|
||||
define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
|
||||
%val = load <2 x half>, <2 x half> addrspace(1)* %in
|
||||
%fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
|
||||
|
@ -1,6 +1,6 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI -check-prefix=SIGFX9 %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=SIGFX9 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
|
||||
|
||||
; GCN-LABEL: {{^}}fpext_f16_to_f32
|
||||
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
|
||||
@ -35,11 +35,10 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32
|
||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||
; GFX9-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; SIGFX9: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||
; VI: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; SI: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GFX89: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}}
|
||||
; GCN: s_endpgm
|
||||
|
||||
@ -55,9 +54,9 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64
|
||||
; GCN: buffer_load_dword
|
||||
; SIGFX9-DAG: v_lshrrev_b32_e32
|
||||
; SIGFX9-DAG: v_cvt_f32_f16_e32
|
||||
; VI: v_cvt_f32_f16_sdwa
|
||||
; SI-DAG: v_lshrrev_b32_e32
|
||||
; SI-DAG: v_cvt_f32_f16_e32
|
||||
; GFX89: v_cvt_f32_f16_sdwa
|
||||
; GCN: v_cvt_f32_f16_e32
|
||||
|
||||
; GCN: v_cvt_f64_f32_e32
|
||||
|
@ -42,13 +42,13 @@ entry:
|
||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
|
||||
|
||||
; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; GFX9: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; GFX9: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GFX9: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||
; GFX9: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
|
||||
; GFX9: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
|
||||
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
||||
define amdgpu_kernel void @rint_v2f16(
|
||||
<2 x half> addrspace(1)* %r,
|
||||
<2 x half> addrspace(1)* %a) {
|
||||
|
88
test/CodeGen/AMDGPU/sdwa-gfx9.mir
Normal file
88
test/CodeGen/AMDGPU/sdwa-gfx9.mir
Normal file
@ -0,0 +1,88 @@
|
||||
# RUN: llc -march=amdgcn -mcpu=kaveri -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
|
||||
# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
|
||||
|
||||
# GCN-LABEL: {{^}}name: add_shr_i32
|
||||
# GCN: [[SMOV:%[0-9]+]] = S_MOV_B32 123
|
||||
|
||||
# CI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec
|
||||
# CI: %{{[0-9]+}} = V_ADD_I32_e32 [[SMOV]], killed [[SHIFT]], implicit-def %vcc, implicit %exec
|
||||
|
||||
# VI: [[VMOV:%[0-9]+]] = V_MOV_B32_e32 [[SMOV]], implicit %exec
|
||||
# VI: %{{[0-9]+}} = V_ADD_I32_sdwa 0, [[VMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit-def %vcc, implicit %exec
|
||||
|
||||
# GFX9: %{{[0-9]+}} = V_ADD_I32_sdwa 0, [[SMOV]], 0, %{{[0-9]+}}, 0, 6, 0, 6, 5, implicit-def %vcc, implicit %exec
|
||||
|
||||
---
|
||||
name: add_shr_i32
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: vreg_64 }
|
||||
- { id: 1, class: vreg_64 }
|
||||
- { id: 2, class: sreg_64 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: sreg_32_xm0 }
|
||||
- { id: 5, class: sreg_32_xm0 }
|
||||
- { id: 6, class: sreg_32 }
|
||||
- { id: 7, class: sreg_32_xm0 }
|
||||
- { id: 8, class: sreg_32 }
|
||||
- { id: 9, class: vgpr_32 }
|
||||
- { id: 10, class: vgpr_32 }
|
||||
- { id: 11, class: vgpr_32 }
|
||||
- { id: 12, class: sreg_32_xm0 }
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31
|
||||
|
||||
%2 = COPY %sgpr30_sgpr31
|
||||
%1 = COPY %vgpr2_vgpr3
|
||||
%0 = COPY %vgpr0_vgpr1
|
||||
%3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4)
|
||||
%12 = S_MOV_B32 123
|
||||
%10 = V_LSHRREV_B32_e64 16, %3, implicit %exec
|
||||
%11 = V_ADD_I32_e32 %12, killed %10, implicit-def %vcc, implicit %exec
|
||||
FLAT_STORE_DWORD %0, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4)
|
||||
%sgpr30_sgpr31 = COPY %2
|
||||
S_SETPC_B64_return %sgpr30_sgpr31
|
||||
|
||||
...
|
||||
|
||||
# GCN-LABEL: {{^}}name: trunc_shr_f32
|
||||
|
||||
# CI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec
|
||||
# CI: %{{[0-9]+}} = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit-def %vcc, implicit %exec
|
||||
|
||||
# VI: [[SHIFT:%[0-9]+]] = V_LSHRREV_B32_e64 16, %{{[0-9]+}}, implicit %exec
|
||||
# VI: %{{[0-9]+}} = V_TRUNC_F32_e64 0, killed [[SHIFT]], 1, 2, implicit-def %vcc, implicit %exec
|
||||
|
||||
#GFX9: %{{[0-9]+}} = V_TRUNC_F32_sdwa 0, %{{[0-9]+}}, 1, 2, 6, 0, 5, implicit %exec
|
||||
|
||||
---
|
||||
name: trunc_shr_f32
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: vreg_64 }
|
||||
- { id: 1, class: vreg_64 }
|
||||
- { id: 2, class: sreg_64 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: sreg_32_xm0 }
|
||||
- { id: 5, class: sreg_32_xm0 }
|
||||
- { id: 6, class: sreg_32 }
|
||||
- { id: 7, class: sreg_32_xm0 }
|
||||
- { id: 8, class: sreg_32 }
|
||||
- { id: 9, class: vgpr_32 }
|
||||
- { id: 10, class: vgpr_32 }
|
||||
- { id: 11, class: vgpr_32 }
|
||||
body: |
|
||||
bb.0:
|
||||
liveins: %vgpr0_vgpr1, %vgpr2_vgpr3, %sgpr30_sgpr31
|
||||
|
||||
%2 = COPY %sgpr30_sgpr31
|
||||
%1 = COPY %vgpr2_vgpr3
|
||||
%0 = COPY %vgpr0_vgpr1
|
||||
%3 = FLAT_LOAD_DWORD %1, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4)
|
||||
%10 = V_LSHRREV_B32_e64 16, %3, implicit %exec
|
||||
%11 = V_TRUNC_F32_e64 0, killed %10, 1, 2, implicit-def %vcc, implicit %exec
|
||||
FLAT_STORE_DWORD %0, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (store 4)
|
||||
%sgpr30_sgpr31 = COPY %2
|
||||
S_SETPC_B64_return %sgpr30_sgpr31
|
@ -1,5 +1,6 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SDWA -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=SDWA -check-prefix=GCN %s
|
||||
|
||||
; GCN-LABEL: {{^}}add_shr_i32:
|
||||
; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
|
||||
@ -72,9 +73,11 @@ entry:
|
||||
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
|
||||
; NOSDWA-NOT: v_mul_u32_u24_sdwa
|
||||
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_LO]], v[[DST_MUL_HI]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -93,12 +96,15 @@ entry:
|
||||
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; NOSDWA-NOT: v_mul_u32_u24_sdwa
|
||||
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -117,18 +123,23 @@ entry:
|
||||
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; NOSDWA-NOT: v_mul_u32_u24_sdwa
|
||||
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL6]], v[[DST_MUL7]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL4]], v[[DST_MUL5]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL2]], v[[DST_MUL3]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL0]], v[[DST_MUL1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX9-DAG: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -162,9 +173,12 @@ entry:
|
||||
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
|
||||
; NOSDWA-NOT: v_mul_f16_sdwa
|
||||
|
||||
; SDWA-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]]
|
||||
; VI-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]]
|
||||
|
||||
; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
|
||||
entry:
|
||||
%a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
|
||||
@ -182,10 +196,13 @@ entry:
|
||||
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; NOSDWA-NOT: v_mul_f16_sdwa
|
||||
|
||||
; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -204,14 +221,19 @@ entry:
|
||||
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; NOSDWA-NOT: v_mul_f16_sdwa
|
||||
|
||||
; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX9-DAG: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -245,7 +267,11 @@ entry:
|
||||
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; NOSDWA-NOT: v_mul_u32_u24_sdwa
|
||||
|
||||
; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
|
||||
; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
|
||||
|
||||
; GFX9-DAG: v_mul_lo_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
|
||||
; GFX9-DAG: v_mul_lo_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX9: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
|
||||
|
||||
define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -264,9 +290,13 @@ entry:
|
||||
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; NOSDWA-NOT: v_mul_u32_u24_sdwa
|
||||
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa
|
||||
; VI-DAG: v_mul_u32_u24_sdwa
|
||||
; VI-DAG: v_mul_u32_u24_sdwa
|
||||
; VI-DAG: v_mul_u32_u24_sdwa
|
||||
|
||||
; GFX9-DAG: v_mul_lo_u16_sdwa
|
||||
; GFX9-DAG: v_mul_lo_u16_sdwa
|
||||
; GFX9-DAG: v_mul_lo_u16_sdwa
|
||||
|
||||
define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -285,12 +315,19 @@ entry:
|
||||
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; NOSDWA-NOT: v_mul_u32_u24_sdwa
|
||||
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa
|
||||
; VI-DAG: v_mul_u32_u24_sdwa
|
||||
; VI-DAG: v_mul_u32_u24_sdwa
|
||||
; VI-DAG: v_mul_u32_u24_sdwa
|
||||
; VI-DAG: v_mul_u32_u24_sdwa
|
||||
; VI-DAG: v_mul_u32_u24_sdwa
|
||||
; VI-DAG: v_mul_u32_u24_sdwa
|
||||
|
||||
; GFX9-DAG: v_mul_lo_u16_sdwa
|
||||
; GFX9-DAG: v_mul_lo_u16_sdwa
|
||||
; GFX9-DAG: v_mul_lo_u16_sdwa
|
||||
; GFX9-DAG: v_mul_lo_u16_sdwa
|
||||
; GFX9-DAG: v_mul_lo_u16_sdwa
|
||||
; GFX9-DAG: v_mul_lo_u16_sdwa
|
||||
|
||||
define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -330,8 +367,11 @@ entry:
|
||||
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
|
||||
; NOSDWA-NOT: v_mac_f16_sdwa
|
||||
|
||||
; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
|
||||
; VI: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
|
||||
|
||||
; GFX9: v_pk_mul_f16 v[[DST_MUL:[0-9]+]], v{{[0-9]+}}, v[[SRC:[0-9]+]]
|
||||
; GFX9: v_pk_add_f16 v{{[0-9]+}}, v[[DST_MUL]], v[[SRC]]
|
||||
|
||||
define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -345,10 +385,13 @@ entry:
|
||||
|
||||
; GCN-LABEL: {{^}}immediate_mul_v2i16:
|
||||
; NOSDWA-NOT: v_mul_u32_u24_sdwa
|
||||
; SDWA-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141
|
||||
; SDWA-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; SDWA-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_mov_b32_e32 v[[M321:[0-9]+]], 0x141
|
||||
; VI-DAG: v_mov_b32_e32 v[[M123:[0-9]+]], 0x7b
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M123]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
; VI-DAG: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[M321]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
|
||||
; GFX9: s_mov_b32 s[[IMM:[0-9]+]], 0x141007b
|
||||
; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v{{[0-9]+}}, s[[IMM]]
|
||||
|
||||
define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
|
||||
entry:
|
||||
@ -367,7 +410,10 @@ entry:
|
||||
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; NOSDWA-NOT: v_mul_u32_u24_sdwa
|
||||
|
||||
; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
|
||||
; GFX9: v_pk_mul_lo_u16 v[[DST1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX9: v_pk_mul_lo_u16 v{{[0-9]+}}, v[[DST1]], v{{[0-9]+}}
|
||||
|
||||
define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -382,7 +428,9 @@ entry:
|
||||
; GCN-LABEL: {{^}}add_bb_v2i16:
|
||||
; NOSDWA-NOT: v_add_i32_sdwa
|
||||
|
||||
; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
|
||||
; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -408,9 +456,11 @@ store_label:
|
||||
; NOSDWA-NOT: v_and_b32_sdwa
|
||||
; NOSDWA-NOT: v_or_b32_sdwa
|
||||
|
||||
; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
|
||||
; SDWA-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; GFX9-DAG: v_and_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||
; SDWA-DAG: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
|
||||
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
|
||||
|
||||
|
@ -1,4 +1,5 @@
|
||||
# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GCN %s
|
||||
# RUN: llc -march=amdgcn -mcpu=fiji -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
|
||||
# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before si-peephole-sdwa -o - %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
|
||||
|
||||
# GCN-LABEL: {{^}}sdwa_imm_operand:
|
||||
# GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
|
||||
@ -8,11 +9,17 @@
|
||||
# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
|
||||
|
||||
# GCN-LABEL: {{^}}sdwa_sgpr_operand:
|
||||
# GCN: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
|
||||
# GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
|
||||
# GCN: BB1_1:
|
||||
# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
|
||||
# GCN: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
|
||||
# VI: v_mov_b32_e32 v[[SHIFT:[0-9]+]], 2
|
||||
# VI-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
|
||||
# VI: BB1_1:
|
||||
# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
|
||||
# VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
|
||||
|
||||
# GFX9: s_mov_b32 s[[SHIFT:[0-9]+]], 2
|
||||
# GFX9-NOT: v_mov_b32_e32 v{{[0-9]+}}, 2
|
||||
# GFX9: BB1_1:
|
||||
# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
|
||||
# GFX9: v_lshlrev_b32_sdwa v{{[0-9]+}}, s[[SHIFT]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
|
||||
|
||||
--- |
|
||||
; ModuleID = 'sdwa-scalar-ops.opt.ll'
|
||||
|
Loading…
x
Reference in New Issue
Block a user