mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
[AMDGPU] Add gfx1030 target
Differential Revision: https://reviews.llvm.org/D81886
This commit is contained in:
parent
b89b9a9eed
commit
9363e58d6d
@ -263,6 +263,15 @@ names from both the *Processor* and *Alternative Processor* can be used.
|
||||
.. TODO::
|
||||
Add product
|
||||
names.
|
||||
``gfx1030`` ``amdgcn`` dGPU - xnack *TBA*
|
||||
[off]
|
||||
- wavefrontsize64
|
||||
[off]
|
||||
- cumode
|
||||
[off]
|
||||
.. TODO::
|
||||
Add product
|
||||
names.
|
||||
=========== =============== ============ ===== ================= ======= ======================
|
||||
|
||||
.. _amdgpu-target-features:
|
||||
@ -806,6 +815,7 @@ The AMDGPU backend uses the following ELF header:
|
||||
``EF_AMDGPU_MACH_AMDGCN_GFX1010`` 0x033 ``gfx1010``
|
||||
``EF_AMDGPU_MACH_AMDGCN_GFX1011`` 0x034 ``gfx1011``
|
||||
``EF_AMDGPU_MACH_AMDGCN_GFX1012`` 0x035 ``gfx1012``
|
||||
``EF_AMDGPU_MACH_AMDGCN_GFX1030`` 0x036 ``gfx1030``
|
||||
================================= ========== =============================
|
||||
|
||||
Sections
|
||||
|
@ -706,6 +706,7 @@ enum : unsigned {
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1010 = 0x033,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1011 = 0x034,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1012 = 0x035,
|
||||
EF_AMDGPU_MACH_AMDGCN_GFX1030 = 0x036,
|
||||
|
||||
// Reserved for AMDGCN-based processors.
|
||||
EF_AMDGPU_MACH_AMDGCN_RESERVED0 = 0x027,
|
||||
@ -713,7 +714,7 @@ enum : unsigned {
|
||||
|
||||
// First/last AMDGCN-based processors.
|
||||
EF_AMDGPU_MACH_AMDGCN_FIRST = EF_AMDGPU_MACH_AMDGCN_GFX600,
|
||||
EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1012,
|
||||
EF_AMDGPU_MACH_AMDGCN_LAST = EF_AMDGPU_MACH_AMDGCN_GFX1030,
|
||||
|
||||
// Indicates if the "xnack" target feature is enabled for all code contained
|
||||
// in the object.
|
||||
|
@ -765,6 +765,11 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
|
||||
"STORE_MIP", [], [AMDGPUArg<llvm_anyfloat_ty, "vdata">],
|
||||
[IntrWriteMem], [SDNPMemOperand], 1>;
|
||||
|
||||
defm int_amdgcn_image_msaa_load
|
||||
: AMDGPUImageDimIntrinsicsAll<"MSAA_LOAD", [llvm_any_ty], [], [IntrReadMem],
|
||||
[SDNPMemOperand]>,
|
||||
AMDGPUImageDMaskIntrinsic;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
// sample and getlod intrinsics
|
||||
//////////////////////////////////////////////////////////////////////////
|
||||
@ -1142,6 +1147,7 @@ def int_amdgcn_buffer_atomic_cmpswap : Intrinsic<
|
||||
[ImmArg<ArgIndex<5>>], "", [SDNPMemOperand]>,
|
||||
AMDGPURsrcIntrinsic<2, 0>;
|
||||
|
||||
def int_amdgcn_buffer_atomic_csub : AMDGPUBufferAtomic;
|
||||
} // defset AMDGPUBufferIntrinsics
|
||||
|
||||
// Uses that do not set the done bit should set IntrWriteMem on the
|
||||
@ -1603,6 +1609,14 @@ def int_amdgcn_s_get_waveid_in_workgroup :
|
||||
GCCBuiltin<"__builtin_amdgcn_s_get_waveid_in_workgroup">,
|
||||
Intrinsic<[llvm_i32_ty], [], [IntrReadMem, IntrInaccessibleMemOnly]>;
|
||||
|
||||
class AMDGPUGlobalAtomicRtn<LLVMType vt> : Intrinsic <
|
||||
[vt],
|
||||
[llvm_anyptr_ty, // vaddr
|
||||
vt], // vdata(VGPR)
|
||||
[IntrArgMemOnly, NoCapture<ArgIndex<0>>], "", [SDNPMemOperand]>;
|
||||
|
||||
def int_amdgcn_global_atomic_csub : AMDGPUGlobalAtomicRtn<llvm_i32_ty>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Deep learning intrinsics.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -84,9 +84,10 @@ enum GPUKind : uint32_t {
|
||||
GK_GFX1010 = 71,
|
||||
GK_GFX1011 = 72,
|
||||
GK_GFX1012 = 73,
|
||||
GK_GFX1030 = 75,
|
||||
|
||||
GK_AMDGCN_FIRST = GK_GFX600,
|
||||
GK_AMDGCN_LAST = GK_GFX1012,
|
||||
GK_AMDGCN_LAST = GK_GFX1030,
|
||||
};
|
||||
|
||||
/// Instruction set architecture version.
|
||||
|
@ -429,6 +429,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
|
||||
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH);
|
||||
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1011, EF_AMDGPU_MACH);
|
||||
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1012, EF_AMDGPU_MACH);
|
||||
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1030, EF_AMDGPU_MACH);
|
||||
BCase(EF_AMDGPU_XNACK);
|
||||
BCase(EF_AMDGPU_SRAM_ECC);
|
||||
break;
|
||||
|
@ -62,7 +62,7 @@ constexpr GPUInfo R600GPUs[26] = {
|
||||
|
||||
// This table should be sorted by the value of GPUKind
|
||||
// Don't bother listing the implicitly true features
|
||||
constexpr GPUInfo AMDGCNGPUs[37] = {
|
||||
constexpr GPUInfo AMDGCNGPUs[38] = {
|
||||
// Name Canonical Kind Features
|
||||
// Name
|
||||
{{"gfx600"}, {"gfx600"}, GK_GFX600, FEATURE_FAST_FMA_F32},
|
||||
@ -102,6 +102,7 @@ constexpr GPUInfo AMDGCNGPUs[37] = {
|
||||
{{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
|
||||
{{"gfx1011"}, {"gfx1011"}, GK_GFX1011, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
|
||||
{{"gfx1012"}, {"gfx1012"}, GK_GFX1012, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
|
||||
{{"gfx1030"}, {"gfx1030"}, GK_GFX1030, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
|
||||
};
|
||||
|
||||
const GPUInfo *getArchEntry(AMDGPU::GPUKind AK, ArrayRef<GPUInfo> Table) {
|
||||
@ -203,6 +204,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
|
||||
case GK_GFX1010: return {10, 1, 0};
|
||||
case GK_GFX1011: return {10, 1, 1};
|
||||
case GK_GFX1012: return {10, 1, 2};
|
||||
case GK_GFX1030: return {10, 3, 0};
|
||||
default: return {0, 0, 0};
|
||||
}
|
||||
}
|
||||
|
@ -260,6 +260,12 @@ def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
|
||||
"Additional instructions for GFX10+"
|
||||
>;
|
||||
|
||||
def FeatureGFX10_3Insts : SubtargetFeature<"gfx10-3-insts",
|
||||
"GFX10_3Insts",
|
||||
"true",
|
||||
"Additional instructions for GFX10.3"
|
||||
>;
|
||||
|
||||
def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts",
|
||||
"GFX7GFX8GFX9Insts",
|
||||
"true",
|
||||
@ -387,6 +393,12 @@ def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding",
|
||||
"Support NSA encoding for image instructions"
|
||||
>;
|
||||
|
||||
def FeatureGFX10_BEncoding : SubtargetFeature<"gfx10_b-encoding",
|
||||
"GFX10_BEncoding",
|
||||
"true",
|
||||
"Encoding format GFX10_B"
|
||||
>;
|
||||
|
||||
def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
|
||||
"HasIntClamp",
|
||||
"true",
|
||||
@ -485,6 +497,30 @@ def FeatureVscnt : SubtargetFeature<"vscnt",
|
||||
"Has separate store vscnt counter"
|
||||
>;
|
||||
|
||||
def FeatureGetWaveIdInst : SubtargetFeature<"get-wave-id-inst",
|
||||
"HasGetWaveIdInst",
|
||||
"true",
|
||||
"Has s_get_waveid_in_workgroup instruction"
|
||||
>;
|
||||
|
||||
def FeatureSMemTimeInst : SubtargetFeature<"s-memtime-inst",
|
||||
"HasSMemTimeInst",
|
||||
"true",
|
||||
"Has s_memtime instruction"
|
||||
>;
|
||||
|
||||
def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts",
|
||||
"HasMadMacF32Insts",
|
||||
"true",
|
||||
"Has v_mad_f32/v_mac_f32/v_madak_f32/v_madmk_f32 instructions"
|
||||
>;
|
||||
|
||||
def FeatureDsSrc2Insts : SubtargetFeature<"ds-src2-insts",
|
||||
"HasDsSrc2Insts",
|
||||
"true",
|
||||
"Has ds_*_src2 instructions"
|
||||
>;
|
||||
|
||||
def FeatureRegisterBanking : SubtargetFeature<"register-banking",
|
||||
"HasRegisterBanking",
|
||||
"true",
|
||||
@ -617,9 +653,10 @@ class GCNSubtargetFeatureGeneration <string Value,
|
||||
def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
|
||||
"southern-islands",
|
||||
[FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
|
||||
FeatureWavefrontSize64,
|
||||
FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange,
|
||||
FeatureDoesNotSupportSRAMECC, FeatureDoesNotSupportXNACK]
|
||||
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
|
||||
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
|
||||
FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC,
|
||||
FeatureDoesNotSupportXNACK]
|
||||
>;
|
||||
|
||||
def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
|
||||
@ -627,7 +664,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
|
||||
[FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
|
||||
FeatureWavefrontSize64, FeatureFlatAddressSpace,
|
||||
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
|
||||
FeatureGFX7GFX8GFX9Insts, FeatureDoesNotSupportSRAMECC]
|
||||
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
|
||||
FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC]
|
||||
>;
|
||||
|
||||
def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
|
||||
@ -638,8 +676,9 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
|
||||
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
|
||||
FeatureScalarStores, FeatureInv2PiInlineImm,
|
||||
FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP,
|
||||
FeatureIntClamp, FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC,
|
||||
FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts, FeatureFastDenormalF32
|
||||
FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts,
|
||||
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
|
||||
FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC, FeatureFastDenormalF32
|
||||
]
|
||||
>;
|
||||
|
||||
@ -655,7 +694,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
|
||||
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
|
||||
FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
|
||||
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
|
||||
FeatureFastDenormalF32]
|
||||
FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts,
|
||||
FeatureFastDenormalF32
|
||||
]
|
||||
>;
|
||||
|
||||
def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
|
||||
@ -843,6 +884,10 @@ def FeatureISAVersion10_1_0 : FeatureSet<
|
||||
FeatureScalarStores,
|
||||
FeatureScalarAtomics,
|
||||
FeatureScalarFlatScratchInsts,
|
||||
FeatureGetWaveIdInst,
|
||||
FeatureSMemTimeInst,
|
||||
FeatureMadMacF32Insts,
|
||||
FeatureDsSrc2Insts,
|
||||
FeatureLdsMisalignedBug,
|
||||
FeatureDoesNotSupportXNACK,
|
||||
FeatureCodeObjectV3])>;
|
||||
@ -861,6 +906,10 @@ def FeatureISAVersion10_1_1 : FeatureSet<
|
||||
FeatureScalarStores,
|
||||
FeatureScalarAtomics,
|
||||
FeatureScalarFlatScratchInsts,
|
||||
FeatureGetWaveIdInst,
|
||||
FeatureSMemTimeInst,
|
||||
FeatureMadMacF32Insts,
|
||||
FeatureDsSrc2Insts,
|
||||
FeatureDoesNotSupportXNACK,
|
||||
FeatureCodeObjectV3])>;
|
||||
|
||||
@ -878,10 +927,29 @@ def FeatureISAVersion10_1_2 : FeatureSet<
|
||||
FeatureScalarStores,
|
||||
FeatureScalarAtomics,
|
||||
FeatureScalarFlatScratchInsts,
|
||||
FeatureGetWaveIdInst,
|
||||
FeatureSMemTimeInst,
|
||||
FeatureMadMacF32Insts,
|
||||
FeatureDsSrc2Insts,
|
||||
FeatureLdsMisalignedBug,
|
||||
FeatureDoesNotSupportXNACK,
|
||||
FeatureCodeObjectV3])>;
|
||||
|
||||
def FeatureISAVersion10_3_0 : FeatureSet<
|
||||
[FeatureGFX10,
|
||||
FeatureGFX10_BEncoding,
|
||||
FeatureGFX10_3Insts,
|
||||
FeatureLDSBankCount32,
|
||||
FeatureDLInsts,
|
||||
FeatureDot1Insts,
|
||||
FeatureDot2Insts,
|
||||
FeatureDot5Insts,
|
||||
FeatureDot6Insts,
|
||||
FeatureNSAEncoding,
|
||||
FeatureWavefrontSize32,
|
||||
FeatureDoesNotSupportXNACK,
|
||||
FeatureCodeObjectV3]>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def AMDGPUInstrInfo : InstrInfo {
|
||||
@ -1039,6 +1107,9 @@ def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts(
|
||||
def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
|
||||
AssemblerPredicate<(all_of FeatureGFX9Insts)>;
|
||||
|
||||
def HasGFX10_BEncoding : Predicate<"Subtarget->hasGFX10_BEncoding()">,
|
||||
AssemblerPredicate<(all_of FeatureGFX10_BEncoding)>;
|
||||
|
||||
def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
|
||||
AssemblerPredicate<(all_of FeatureUnpackedD16VMem)>;
|
||||
def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
|
||||
@ -1148,15 +1219,32 @@ def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">,
|
||||
def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">,
|
||||
AssemblerPredicate<(all_of FeatureDot6Insts)>;
|
||||
|
||||
def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
|
||||
AssemblerPredicate<(all_of FeatureGetWaveIdInst)>;
|
||||
|
||||
def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">,
|
||||
AssemblerPredicate<(all_of FeatureMAIInsts)>;
|
||||
|
||||
def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
|
||||
AssemblerPredicate<(all_of FeatureSMemTimeInst)>;
|
||||
|
||||
def HasNoSMemTimeInst : Predicate<"!Subtarget->hasSMemTimeInst()">;
|
||||
|
||||
def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
|
||||
AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>;
|
||||
|
||||
def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">,
|
||||
AssemblerPredicate<(all_of FeatureMadMacF32Insts)>;
|
||||
|
||||
def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">,
|
||||
AssemblerPredicate<(all_of FeatureAtomicFaddInsts)>;
|
||||
|
||||
def HasNoMadMacF32Insts : Predicate<"!Subtarget->hasMadMacF32Insts()">,
|
||||
AssemblerPredicate<(all_of (not FeatureMadMacF32Insts))>;
|
||||
|
||||
// Predicate guarding the ds_*_src2 LDS instructions.
// The codegen condition and the assembler condition must agree: both hold when
// the subtarget has FeatureDsSrc2Insts. The C++ condition was previously
// negated, which made the codegen predicate true exactly on subtargets that
// lack the instructions.
def HasDsSrc2Insts : Predicate<"Subtarget->hasDsSrc2Insts()">,
  AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
|
||||
|
||||
// Predicate that holds on subtargets carrying FeatureOffset3fBug.
// The codegen condition is kept un-negated so that it matches both the
// predicate's name and its assembler condition; the previous "!" made the
// codegen side true only on subtargets without the bug.
def HasOffset3fBug : Predicate<"Subtarget->hasOffset3fBug()">,
  AssemblerPredicate<(all_of FeatureOffset3fBug)>;
|
||||
|
||||
|
@ -923,7 +923,10 @@ Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
|
||||
Value *FQNeg = Builder.CreateFNeg(FQ);
|
||||
|
||||
// float fr = mad(fqneg, fb, fa);
|
||||
Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
|
||||
auto FMAD = !ST->hasMadMacF32Insts()
|
||||
? Intrinsic::fma
|
||||
: (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
|
||||
Value *FR = Builder.CreateIntrinsic(FMAD,
|
||||
{FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
|
||||
|
||||
// int iq = (int)fq;
|
||||
|
@ -716,7 +716,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
|
||||
(Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
|
||||
Opc == ISD::ATOMIC_LOAD_FADD ||
|
||||
Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
|
||||
Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
|
||||
Opc == AMDGPUISD::ATOMIC_LOAD_FMAX ||
|
||||
Opc == AMDGPUISD::ATOMIC_LOAD_CSUB)) {
|
||||
N = glueCopyToM0LDSInit(N);
|
||||
SelectCode(N);
|
||||
return;
|
||||
|
@ -1699,10 +1699,11 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
|
||||
const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
|
||||
|
||||
// float fr = mad(fqneg, fb, fa);
|
||||
unsigned OpCode = !MFI->getMode().allFP32Denormals() ?
|
||||
unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
|
||||
(unsigned)ISD::FMA :
|
||||
!MFI->getMode().allFP32Denormals() ?
|
||||
(unsigned)ISD::FMAD :
|
||||
(unsigned)AMDGPUISD::FMAD_FTZ;
|
||||
|
||||
SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
|
||||
|
||||
// int iq = (int)fq;
|
||||
@ -1785,11 +1786,12 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
|
||||
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
|
||||
|
||||
// Compute denominator reciprocal.
|
||||
unsigned FMAD = !MFI->getMode().allFP32Denormals() ?
|
||||
unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
|
||||
(unsigned)ISD::FMA :
|
||||
!MFI->getMode().allFP32Denormals() ?
|
||||
(unsigned)ISD::FMAD :
|
||||
(unsigned)AMDGPUISD::FMAD_FTZ;
|
||||
|
||||
|
||||
SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
|
||||
SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
|
||||
SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
|
||||
@ -4394,6 +4396,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
NODE_NAME_CASE(ATOMIC_DEC)
|
||||
NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
|
||||
NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
|
||||
NODE_NAME_CASE(ATOMIC_LOAD_CSUB)
|
||||
NODE_NAME_CASE(BUFFER_LOAD)
|
||||
NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
|
||||
NODE_NAME_CASE(BUFFER_LOAD_USHORT)
|
||||
@ -4420,6 +4423,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
NODE_NAME_CASE(BUFFER_ATOMIC_INC)
|
||||
NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
|
||||
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
|
||||
NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
|
||||
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
|
||||
NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
|
||||
NODE_NAME_CASE(ATOMIC_PK_FADD)
|
||||
|
@ -509,6 +509,7 @@ enum NodeType : unsigned {
|
||||
ATOMIC_DEC,
|
||||
ATOMIC_LOAD_FMIN,
|
||||
ATOMIC_LOAD_FMAX,
|
||||
ATOMIC_LOAD_CSUB,
|
||||
BUFFER_LOAD,
|
||||
BUFFER_LOAD_UBYTE,
|
||||
BUFFER_LOAD_USHORT,
|
||||
@ -535,6 +536,7 @@ enum NodeType : unsigned {
|
||||
BUFFER_ATOMIC_INC,
|
||||
BUFFER_ATOMIC_DEC,
|
||||
BUFFER_ATOMIC_CMPSWAP,
|
||||
BUFFER_ATOMIC_CSUB,
|
||||
BUFFER_ATOMIC_FADD,
|
||||
BUFFER_ATOMIC_PK_FADD,
|
||||
ATOMIC_PK_FADD,
|
||||
|
@ -198,6 +198,7 @@ def : SourceOfDivergence<int_r600_read_tidig_y>;
|
||||
def : SourceOfDivergence<int_r600_read_tidig_z>;
|
||||
def : SourceOfDivergence<int_amdgcn_atomic_inc>;
|
||||
def : SourceOfDivergence<int_amdgcn_atomic_dec>;
|
||||
def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
|
||||
def : SourceOfDivergence<int_amdgcn_ds_fadd>;
|
||||
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
|
||||
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
|
||||
@ -238,6 +239,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
|
||||
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
|
||||
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
|
||||
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
|
||||
def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
|
||||
def : SourceOfDivergence<int_amdgcn_ps_live>;
|
||||
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
|
||||
def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
|
||||
|
@ -153,6 +153,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
|
||||
TargetTriple(TT),
|
||||
Has16BitInsts(false),
|
||||
HasMadMixInsts(false),
|
||||
HasMadMacF32Insts(false),
|
||||
HasDsSrc2Insts(false),
|
||||
HasSDWA(false),
|
||||
HasVOP3PInsts(false),
|
||||
HasMulI24(true),
|
||||
@ -205,6 +207,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
|
||||
GFX8Insts(false),
|
||||
GFX9Insts(false),
|
||||
GFX10Insts(false),
|
||||
GFX10_3Insts(false),
|
||||
GFX7GFX8GFX9Insts(false),
|
||||
SGPRInitBug(false),
|
||||
HasSMemRealTime(false),
|
||||
@ -225,6 +228,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
|
||||
HasGFX10A16(false),
|
||||
HasG16(false),
|
||||
HasNSAEncoding(false),
|
||||
GFX10_BEncoding(false),
|
||||
HasDLInsts(false),
|
||||
HasDot1Insts(false),
|
||||
HasDot2Insts(false),
|
||||
@ -239,6 +243,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
|
||||
DoesNotSupportSRAMECC(false),
|
||||
HasNoSdstCMPX(false),
|
||||
HasVscnt(false),
|
||||
HasGetWaveIdInst(false),
|
||||
HasSMemTimeInst(false),
|
||||
HasRegisterBanking(false),
|
||||
HasVOP3Literal(false),
|
||||
HasNoDataDepHazard(false),
|
||||
|
@ -67,6 +67,8 @@ private:
|
||||
protected:
|
||||
bool Has16BitInsts;
|
||||
bool HasMadMixInsts;
|
||||
bool HasMadMacF32Insts;
|
||||
bool HasDsSrc2Insts;
|
||||
bool HasSDWA;
|
||||
bool HasVOP3PInsts;
|
||||
bool HasMulI24;
|
||||
@ -140,6 +142,10 @@ public:
|
||||
return isAmdHsaOS() || isMesaKernel(F);
|
||||
}
|
||||
|
||||
/// \returns true if the architecture of the target triple is amdgcn.
bool isGCN() const {
|
||||
return TargetTriple.getArch() == Triple::amdgcn;
|
||||
}
|
||||
|
||||
bool has16BitInsts() const {
|
||||
return Has16BitInsts;
|
||||
}
|
||||
@ -148,6 +154,14 @@ public:
|
||||
return HasMadMixInsts;
|
||||
}
|
||||
|
||||
/// \returns true if the HasMadMacF32Insts flag is set, or unconditionally when
/// the target is not amdgcn (isGCN() is false), regardless of the flag.
bool hasMadMacF32Insts() const {
|
||||
return HasMadMacF32Insts || !isGCN();
|
||||
}
|
||||
|
||||
bool hasDsSrc2Insts() const {
|
||||
return HasDsSrc2Insts;
|
||||
}
|
||||
|
||||
bool hasSDWA() const {
|
||||
return HasSDWA;
|
||||
}
|
||||
@ -325,6 +339,7 @@ protected:
|
||||
bool GFX8Insts;
|
||||
bool GFX9Insts;
|
||||
bool GFX10Insts;
|
||||
bool GFX10_3Insts;
|
||||
bool GFX7GFX8GFX9Insts;
|
||||
bool SGPRInitBug;
|
||||
bool HasSMemRealTime;
|
||||
@ -345,6 +360,7 @@ protected:
|
||||
bool HasGFX10A16;
|
||||
bool HasG16;
|
||||
bool HasNSAEncoding;
|
||||
bool GFX10_BEncoding;
|
||||
bool HasDLInsts;
|
||||
bool HasDot1Insts;
|
||||
bool HasDot2Insts;
|
||||
@ -359,6 +375,8 @@ protected:
|
||||
bool DoesNotSupportSRAMECC;
|
||||
bool HasNoSdstCMPX;
|
||||
bool HasVscnt;
|
||||
bool HasGetWaveIdInst;
|
||||
bool HasSMemTimeInst;
|
||||
bool HasRegisterBanking;
|
||||
bool HasVOP3Literal;
|
||||
bool HasNoDataDepHazard;
|
||||
@ -721,6 +739,14 @@ public:
|
||||
return ScalarFlatScratchInsts;
|
||||
}
|
||||
|
||||
/// \returns the value of the GFX10_BEncoding flag; this query has no separate
/// flag of its own.
bool hasGlobalAddTidInsts() const {
|
||||
return GFX10_BEncoding;
|
||||
}
|
||||
|
||||
/// \returns the value of the GFX10_BEncoding flag; this query has no separate
/// flag of its own.
bool hasAtomicCSub() const {
|
||||
return GFX10_BEncoding;
|
||||
}
|
||||
|
||||
bool hasMultiDwordFlatScratchAddressing() const {
|
||||
return getGeneration() >= GFX9;
|
||||
}
|
||||
@ -854,6 +880,14 @@ public:
|
||||
return HasVscnt;
|
||||
}
|
||||
|
||||
bool hasGetWaveIdInst() const {
|
||||
return HasGetWaveIdInst;
|
||||
}
|
||||
|
||||
bool hasSMemTimeInst() const {
|
||||
return HasSMemTimeInst;
|
||||
}
|
||||
|
||||
bool hasRegisterBanking() const {
|
||||
return HasRegisterBanking;
|
||||
}
|
||||
@ -972,6 +1006,14 @@ public:
|
||||
return HasNSAEncoding;
|
||||
}
|
||||
|
||||
bool hasGFX10_BEncoding() const {
|
||||
return GFX10_BEncoding;
|
||||
}
|
||||
|
||||
bool hasGFX10_3Insts() const {
|
||||
return GFX10_3Insts;
|
||||
}
|
||||
|
||||
bool hasMadF16() const;
|
||||
|
||||
bool enableSIScheduler() const {
|
||||
|
@ -1188,6 +1188,10 @@ public:
|
||||
return AMDGPU::isGFX10(getSTI());
|
||||
}
|
||||
|
||||
/// Forwards to AMDGPU::isGFX10_BEncoding() with the subtarget info obtained
/// from getSTI().
bool isGFX10_BEncoding() const {
|
||||
return AMDGPU::isGFX10_BEncoding(getSTI());
|
||||
}
|
||||
|
||||
bool hasInv2PiInlineImm() const {
|
||||
return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
|
||||
}
|
||||
|
@ -1003,6 +1003,11 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
|
||||
"buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64
|
||||
>;
|
||||
|
||||
let SubtargetPredicate = HasGFX10_BEncoding in
|
||||
defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics_RTN <
|
||||
"buffer_atomic_csub", VGPR_32, i32, atomic_csub_global_32
|
||||
>;
|
||||
|
||||
let SubtargetPredicate = isGFX8GFX9 in {
|
||||
def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">;
|
||||
}
|
||||
@ -1372,6 +1377,7 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_or, i32, "BUFFER_ATOMIC_OR">;
|
||||
defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i32, "BUFFER_ATOMIC_XOR">;
|
||||
defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i32, "BUFFER_ATOMIC_INC">;
|
||||
defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i32, "BUFFER_ATOMIC_DEC">;
|
||||
defm : BufferAtomicPatterns<SIbuffer_atomic_csub, i32, "BUFFER_ATOMIC_CSUB">;
|
||||
defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i64, "BUFFER_ATOMIC_SWAP_X2">;
|
||||
defm : BufferAtomicPatterns<SIbuffer_atomic_add, i64, "BUFFER_ATOMIC_ADD_X2">;
|
||||
defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i64, "BUFFER_ATOMIC_SUB_X2">;
|
||||
@ -1879,8 +1885,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
|
||||
def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
|
||||
MUBUFLdsTable<1, NAME # "_BOTHEN_gfx10">;
|
||||
}
|
||||
multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> :
|
||||
MUBUF_Real_AllAddr_gfx10<op> {
|
||||
multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op> {
|
||||
def _BOTHEN_RTN_gfx10 :
|
||||
MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
|
||||
def _IDXEN_RTN_gfx10 :
|
||||
@ -1890,6 +1895,8 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
|
||||
def _OFFSET_RTN_gfx10 :
|
||||
MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
|
||||
}
|
||||
multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> :
|
||||
MUBUF_Real_AllAddr_gfx10<op>, MUBUF_Real_Atomics_RTN_gfx10<op>;
|
||||
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
|
||||
|
||||
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>;
|
||||
@ -2054,6 +2061,8 @@ defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
|
||||
defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
|
||||
defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;
|
||||
|
||||
defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_RTN_gfx10<0x034>;
|
||||
|
||||
defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>;
|
||||
defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>;
|
||||
def BUFFER_WBINVL1_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<0x071, BUFFER_WBINVL1>;
|
||||
|
@ -505,6 +505,7 @@ def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
|
||||
def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
|
||||
}
|
||||
|
||||
let SubtargetPredicate = HasDsSrc2Insts in {
|
||||
def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
|
||||
def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
|
||||
def DS_RSUB_SRC2_U32 : DS_1A<"ds_rsub_src2_u32">;
|
||||
@ -537,6 +538,7 @@ def DS_MAX_SRC2_F64 : DS_1A<"ds_max_src2_f64">;
|
||||
|
||||
def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">;
|
||||
def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
|
||||
} // End SubtargetPredicate = HasDsSrc2Insts
|
||||
|
||||
let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
|
||||
def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, SwizzleImm>;
|
||||
@ -619,7 +621,7 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
|
||||
|
||||
} // let SubtargetPredicate = isGFX8Plus
|
||||
|
||||
let SubtargetPredicate = HasLDSFPAtomics in {
|
||||
let SubtargetPredicate = HasLDSFPAtomics, OtherPredicates = [HasDsSrc2Insts] in {
|
||||
def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
|
||||
}
|
||||
|
||||
|
@ -296,6 +296,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
|
||||
if (Bytes.size() >= 8) {
|
||||
const uint64_t QW = eatBytes<uint64_t>(Bytes);
|
||||
|
||||
if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) {
|
||||
Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address);
|
||||
if (Res) {
|
||||
if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8)
|
||||
== -1)
|
||||
break;
|
||||
if (convertDPP8Inst(MI) == MCDisassembler::Success)
|
||||
break;
|
||||
MI = MCInst(); // clear
|
||||
}
|
||||
}
|
||||
|
||||
Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address);
|
||||
if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
|
||||
break;
|
||||
@ -345,6 +357,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
|
||||
Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address);
|
||||
if (Res) break;
|
||||
|
||||
if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) {
|
||||
Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address);
|
||||
if (Res) break;
|
||||
}
|
||||
|
||||
Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address);
|
||||
if (Res) break;
|
||||
|
||||
|
@ -183,6 +183,25 @@ multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit Ha
|
||||
}
|
||||
}
|
||||
|
||||
class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,
|
||||
bit HasTiedOutput = 0, bit HasSignedOffset = 0> : FLAT_Pseudo<
|
||||
opName,
|
||||
(outs regClass:$vdst),
|
||||
!con((ins SReg_64:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc),
|
||||
!if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
|
||||
" $vdst, $saddr$offset$glc$slc$dlc"> {
|
||||
let is_flat_global = 1;
|
||||
let has_data = 0;
|
||||
let mayLoad = 1;
|
||||
let has_vaddr = 0;
|
||||
let has_saddr = 1;
|
||||
let enabled_saddr = 1;
|
||||
let maybeAtomic = 1;
|
||||
|
||||
let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
|
||||
let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
|
||||
}
|
||||
|
||||
multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
|
||||
let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
|
||||
def "" : FLAT_Store_Pseudo<opName, regClass, 1>,
|
||||
@ -192,6 +211,24 @@ multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
|
||||
}
|
||||
}
|
||||
|
||||
class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
|
||||
bit HasSignedOffset = 0> : FLAT_Pseudo<
|
||||
opName,
|
||||
(outs),
|
||||
!con(
|
||||
(ins vdataClass:$vdata, SReg_64:$saddr),
|
||||
(ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
|
||||
" $vdata, $saddr$offset$glc$slc$dlc"> {
|
||||
let is_flat_global = 1;
|
||||
let mayLoad = 0;
|
||||
let mayStore = 1;
|
||||
let has_vdst = 0;
|
||||
let has_vaddr = 0;
|
||||
let has_saddr = 1;
|
||||
let enabled_saddr = 1;
|
||||
let maybeAtomic = 1;
|
||||
}
|
||||
|
||||
class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
|
||||
bit EnableSaddr = 0>: FLAT_Pseudo<
|
||||
opName,
|
||||
@ -526,6 +563,8 @@ defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16"
|
||||
defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>;
|
||||
defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32, 1>;
|
||||
defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>;
|
||||
let OtherPredicates = [HasGFX10_BEncoding] in
|
||||
def GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>;
|
||||
|
||||
defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>;
|
||||
defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>;
|
||||
@ -533,6 +572,8 @@ defm GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR
|
||||
defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>;
|
||||
defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>;
|
||||
defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>;
|
||||
let OtherPredicates = [HasGFX10_BEncoding] in
|
||||
def GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPR_32>;
|
||||
|
||||
defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>;
|
||||
defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>;
|
||||
@ -618,6 +659,10 @@ defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2",
|
||||
|
||||
defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2",
|
||||
VReg_64, i64, atomic_dec_global_64>;
|
||||
|
||||
let SubtargetPredicate = HasGFX10_BEncoding in
|
||||
defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo_RTN <"global_atomic_csub",
|
||||
VGPR_32, i32, atomic_csub_global_32>;
|
||||
} // End is_flat_global = 1
|
||||
|
||||
|
||||
@ -914,6 +959,7 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>;
|
||||
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>;
|
||||
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global_32, i32, v2i32>;
|
||||
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>;
|
||||
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CSUB_RTN, atomic_csub_global_32, i32>;
|
||||
|
||||
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>;
|
||||
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>;
|
||||
@ -1214,6 +1260,9 @@ multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op> :
|
||||
FLAT_Real_RTN_gfx10<op>,
|
||||
FLAT_Real_SADDR_RTN_gfx10<op>;
|
||||
|
||||
multiclass FLAT_Real_GlblAtomics_RTN_gfx10<bits<7> op> :
|
||||
FLAT_Real_RTN_gfx10<op>,
|
||||
FLAT_Real_SADDR_RTN_gfx10<op>;
|
||||
|
||||
// ENC_FLAT.
|
||||
defm FLAT_LOAD_UBYTE : FLAT_Real_Base_gfx10<0x008>;
|
||||
@ -1299,6 +1348,7 @@ defm GLOBAL_ATOMIC_SWAP : FLAT_Real_GlblAtomics_gfx10<0x030>;
|
||||
defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Real_GlblAtomics_gfx10<0x031>;
|
||||
defm GLOBAL_ATOMIC_ADD : FLAT_Real_GlblAtomics_gfx10<0x032>;
|
||||
defm GLOBAL_ATOMIC_SUB : FLAT_Real_GlblAtomics_gfx10<0x033>;
|
||||
defm GLOBAL_ATOMIC_CSUB : FLAT_Real_GlblAtomics_RTN_gfx10<0x034>;
|
||||
defm GLOBAL_ATOMIC_SMIN : FLAT_Real_GlblAtomics_gfx10<0x035>;
|
||||
defm GLOBAL_ATOMIC_UMIN : FLAT_Real_GlblAtomics_gfx10<0x036>;
|
||||
defm GLOBAL_ATOMIC_SMAX : FLAT_Real_GlblAtomics_gfx10<0x037>;
|
||||
@ -1327,7 +1377,8 @@ defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05d>;
|
||||
defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x05e>;
|
||||
defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f>;
|
||||
defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>;
|
||||
|
||||
defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_Base_gfx10<0x016>;
|
||||
defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_Base_gfx10<0x017>;
|
||||
|
||||
// ENC_FLAT_SCRATCH.
|
||||
defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_gfx10<0x008>;
|
||||
|
@ -183,3 +183,7 @@ def : ProcessorModel<"gfx1011", GFX10SpeedModel,
|
||||
def : ProcessorModel<"gfx1012", GFX10SpeedModel,
|
||||
FeatureISAVersion10_1_2.Features
|
||||
>;
|
||||
|
||||
def : ProcessorModel<"gfx1030", GFX10SpeedModel,
|
||||
FeatureISAVersion10_3_0.Features
|
||||
>;
|
||||
|
@ -97,6 +97,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
|
||||
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
|
||||
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
|
||||
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break;
|
||||
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030: AK = GK_GFX1030; break;
|
||||
case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
|
||||
}
|
||||
|
||||
@ -148,6 +149,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
|
||||
case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
|
||||
case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
|
||||
case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012;
|
||||
case GK_GFX1030: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030;
|
||||
case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
|
||||
}
|
||||
|
||||
|
@ -829,6 +829,9 @@ defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <0x000000ef, AMDGPUSample_c_cd_cl
|
||||
//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
|
||||
//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
|
||||
|
||||
let SubtargetPredicate = HasGFX10_BEncoding in
|
||||
defm IMAGE_MSAA_LOAD : MIMG_NoSampler <0x00000080, "image_msaa_load", 1>;
|
||||
|
||||
/********** ========================================= **********/
|
||||
/********** Table of dimension-aware image intrinsics **********/
|
||||
/********** ========================================= **********/
|
||||
|
@ -333,7 +333,9 @@ enum Id { // HwRegCode, (6) [5:0]
|
||||
ID_FLAT_SCR_HI = 21,
|
||||
ID_XNACK_MASK = 22,
|
||||
ID_POPS_PACKER = 25,
|
||||
ID_SYMBOLIC_LAST_ = 26,
|
||||
ID_SHADER_CYCLES = 29,
|
||||
ID_SYMBOLIC_FIRST_GFX1030_ = ID_SHADER_CYCLES,
|
||||
ID_SYMBOLIC_LAST_ = 30,
|
||||
ID_SHIFT_ = 0,
|
||||
ID_WIDTH_ = 6,
|
||||
ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
|
||||
|
@ -453,10 +453,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
||||
setOperationAction(ISD::FLOG10, MVT::f16, Custom);
|
||||
}
|
||||
|
||||
// v_mad_f32 does not support denormals. Where the instruction exists we
|
||||
// report FMAD as legal regardless of the denormal mode, and the context
|
||||
// where it is formed will disallow it when fp32 denormals are enabled.
|
||||
setOperationAction(ISD::FMAD, MVT::f32, Legal);
|
||||
if (Subtarget->hasMadMacF32Insts())
|
||||
setOperationAction(ISD::FMAD, MVT::f32, Legal);
|
||||
|
||||
if (!Subtarget->hasBFI()) {
|
||||
// fcopysign can be done in a single instruction with BFI.
|
||||
@ -1130,6 +1128,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
|
||||
|
||||
return true;
|
||||
}
|
||||
case Intrinsic::amdgcn_global_atomic_csub: {
|
||||
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
||||
Info.memVT = MVT::getVT(CI.getType());
|
||||
Info.ptrVal = CI.getOperand(0);
|
||||
Info.align.reset();
|
||||
Info.flags = MachineMemOperand::MOLoad |
|
||||
MachineMemOperand::MOStore |
|
||||
MachineMemOperand::MODereferenceable |
|
||||
MachineMemOperand::MOVolatile;
|
||||
return true;
|
||||
}
|
||||
case Intrinsic::amdgcn_ds_gws_init:
|
||||
case Intrinsic::amdgcn_ds_gws_barrier:
|
||||
case Intrinsic::amdgcn_ds_gws_sema_v:
|
||||
@ -4283,7 +4292,8 @@ bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
|
||||
// v_mad_f32/v_mac_f32 do not support denormals.
|
||||
EVT VT = N->getValueType(0);
|
||||
if (VT == MVT::f32)
|
||||
return !hasFP32Denormals(DAG.getMachineFunction());
|
||||
return Subtarget->hasMadMacF32Insts() &&
|
||||
!hasFP32Denormals(DAG.getMachineFunction());
|
||||
if (VT == MVT::f16) {
|
||||
return Subtarget->hasMadF16() &&
|
||||
!hasFP64FP16Denormals(DAG.getMachineFunction());
|
||||
@ -6859,6 +6869,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
||||
case Intrinsic::amdgcn_buffer_atomic_swap:
|
||||
case Intrinsic::amdgcn_buffer_atomic_add:
|
||||
case Intrinsic::amdgcn_buffer_atomic_sub:
|
||||
case Intrinsic::amdgcn_buffer_atomic_csub:
|
||||
case Intrinsic::amdgcn_buffer_atomic_smin:
|
||||
case Intrinsic::amdgcn_buffer_atomic_umin:
|
||||
case Intrinsic::amdgcn_buffer_atomic_smax:
|
||||
@ -6901,6 +6912,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
||||
case Intrinsic::amdgcn_buffer_atomic_sub:
|
||||
Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
|
||||
break;
|
||||
case Intrinsic::amdgcn_buffer_atomic_csub:
|
||||
Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
|
||||
break;
|
||||
case Intrinsic::amdgcn_buffer_atomic_smin:
|
||||
Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
|
||||
break;
|
||||
@ -7149,6 +7163,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
|
||||
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
|
||||
Op->getVTList(), Ops, VT, M->getMemOperand());
|
||||
}
|
||||
case Intrinsic::amdgcn_global_atomic_csub: {
|
||||
MemSDNode *M = cast<MemSDNode>(Op);
|
||||
SDValue Ops[] = {
|
||||
M->getOperand(0), // Chain
|
||||
M->getOperand(2), // Ptr
|
||||
M->getOperand(3) // Value
|
||||
};
|
||||
|
||||
return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_LOAD_CSUB, SDLoc(Op),
|
||||
M->getVTList(), Ops, M->getMemoryVT(),
|
||||
M->getMemOperand());
|
||||
}
|
||||
|
||||
default:
|
||||
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
|
||||
|
@ -28,6 +28,7 @@ def SIEncodingFamily {
|
||||
int GFX9 = 5;
|
||||
int GFX10 = 6;
|
||||
int SDWA10 = 7;
|
||||
int GFX10_B = 8;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
@ -54,6 +55,10 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
|
||||
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
|
||||
>;
|
||||
|
||||
def SIatomic_csub : SDNode<"AMDGPUISD::ATOMIC_LOAD_CSUB", SDTAtomic2,
|
||||
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
|
||||
>;
|
||||
|
||||
def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
|
||||
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
|
||||
]>;
|
||||
@ -197,6 +202,7 @@ def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">;
|
||||
def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
|
||||
def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
|
||||
def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
|
||||
def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
|
||||
def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>;
|
||||
def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>;
|
||||
|
||||
@ -305,6 +311,10 @@ class isPackedType<ValueType SrcVT> {
|
||||
// PatFrags for global memory operations
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_global").AddrSpaces in {
|
||||
defm atomic_csub_global : binary_atomic_op<SIatomic_csub>;
|
||||
}
|
||||
|
||||
foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
|
||||
let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
|
||||
|
||||
@ -658,6 +668,7 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
|
||||
|
||||
defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
|
||||
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
|
||||
defm atomic_load_csub : SIAtomicM0Glue2 <"LOAD_CSUB", 1>;
|
||||
defm atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
|
||||
defm atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
|
||||
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
|
||||
@ -1374,6 +1385,7 @@ def HWREG {
|
||||
int FLAT_SCR_HI = 21;
|
||||
int XNACK_MASK = 22;
|
||||
int POPS_PACKER = 25;
|
||||
int SHADER_CYCLES = 29;
|
||||
}
|
||||
|
||||
class getHwRegImm<int Reg, int Offset = 0, int Size = 32> {
|
||||
|
@ -916,6 +916,7 @@ class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
|
||||
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
|
||||
>;
|
||||
|
||||
let SubtargetPredicate = HasMadMacF32Insts in
|
||||
def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
|
||||
def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
|
||||
let SubtargetPredicate = Has16BitInsts;
|
||||
|
@ -220,7 +220,7 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
|
||||
// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
|
||||
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
|
||||
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
|
||||
if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
|
||||
if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
|
||||
return;
|
||||
|
||||
MachineFunction *MF = MI.getParent()->getParent();
|
||||
|
@ -319,6 +319,7 @@ defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <
|
||||
}
|
||||
} // End SubtargetPredicate = HasScalarStores
|
||||
|
||||
let SubtargetPredicate = HasSMemTimeInst in
|
||||
def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>;
|
||||
def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>;
|
||||
|
||||
@ -339,10 +340,11 @@ defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>;
|
||||
}
|
||||
} // SubtargetPredicate = isGFX8Plus
|
||||
|
||||
let SubtargetPredicate = isGFX10Plus in {
|
||||
let SubtargetPredicate = isGFX10Plus in
|
||||
def S_GL1_INV : SM_Inval_Pseudo<"s_gl1_inv">;
|
||||
let SubtargetPredicate = HasGetWaveIdInst in
|
||||
def S_GET_WAVEID_IN_WORKGROUP : SM_WaveId_Pseudo <"s_get_waveid_in_workgroup", int_amdgcn_s_get_waveid_in_workgroup>;
|
||||
} // End SubtargetPredicate = isGFX10Plus
|
||||
|
||||
|
||||
let SubtargetPredicate = HasScalarFlatScratchInsts, Uses = [FLAT_SCR] in {
|
||||
defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>;
|
||||
@ -847,10 +849,21 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>;
|
||||
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>;
|
||||
} // End let AddedComplexity = 100
|
||||
|
||||
let OtherPredicates = [HasSMemTimeInst] in {
|
||||
def : GCNPat <
|
||||
(i64 (readcyclecounter)),
|
||||
(S_MEMTIME)
|
||||
>;
|
||||
} // let OtherPredicates = [HasSMemTimeInst]
|
||||
|
||||
let OtherPredicates = [HasNoSMemTimeInst] in {
|
||||
def : GCNPat <
|
||||
(i64 (readcyclecounter)),
|
||||
(REG_SEQUENCE SReg_64,
|
||||
(S_GETREG_B32 getHwRegImm<HWREG.SHADER_CYCLES, 0, -12>.ret), sub0,
|
||||
(S_MOV_B32 (i32 0)), sub1)
|
||||
>;
|
||||
} // let OtherPredicates = [HasNoSMemTimeInst]
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// GFX10.
|
||||
|
@ -793,7 +793,11 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo <
|
||||
"$sdst, $simm16"
|
||||
>;
|
||||
|
||||
let hasSideEffects = 1 in {
|
||||
|
||||
let mayLoad = 1 in {
|
||||
// s_getreg_b32 should use hasSideEffects = 1 for tablegen to allow
|
||||
// its use in the readcyclecounter selection.
|
||||
def S_GETREG_B32 : SOPK_Pseudo <
|
||||
"s_getreg_b32",
|
||||
(outs SReg_32:$sdst), (ins hwreg:$simm16),
|
||||
@ -801,7 +805,7 @@ def S_GETREG_B32 : SOPK_Pseudo <
|
||||
>;
|
||||
}
|
||||
|
||||
let hasSideEffects = 1, mayLoad = 0, mayStore =0 in {
|
||||
let mayLoad = 0, mayStore =0 in {
|
||||
|
||||
def S_SETREG_B32 : SOPK_Pseudo <
|
||||
"s_setreg_b32",
|
||||
@ -829,6 +833,7 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo <
|
||||
let Uses = [MODE];
|
||||
}
|
||||
|
||||
}
|
||||
} // End hasSideEffects = 1
|
||||
|
||||
class SOPK_WAITCNT<string opName, list<dag> pat=[]> :
|
||||
|
@ -78,7 +78,11 @@ const char* const IdSymbolic[] = {
|
||||
"HW_REG_XNACK_MASK",
|
||||
nullptr, // HW_ID1, no predictable values
|
||||
nullptr, // HW_ID2, no predictable values
|
||||
"HW_REG_POPS_PACKER"
|
||||
"HW_REG_POPS_PACKER",
|
||||
nullptr,
|
||||
nullptr,
|
||||
nullptr,
|
||||
"HW_REG_SHADER_CYCLES"
|
||||
};
|
||||
|
||||
} // namespace Hwreg
|
||||
|
@ -311,7 +311,7 @@ unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
|
||||
// FIXME: Need to take scratch memory into account.
|
||||
if (!isGFX10(*STI))
|
||||
return 10;
|
||||
return 20;
|
||||
return hasGFX10_3Insts(*STI) ? 16 : 20;
|
||||
}
|
||||
|
||||
unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI,
|
||||
@ -441,12 +441,21 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
|
||||
bool IsWave32 = EnableWavefrontSize32 ?
|
||||
*EnableWavefrontSize32 :
|
||||
STI->getFeatureBits().test(FeatureWavefrontSize32);
|
||||
|
||||
if (hasGFX10_3Insts(*STI))
|
||||
return IsWave32 ? 16 : 8;
|
||||
|
||||
return IsWave32 ? 8 : 4;
|
||||
}
|
||||
|
||||
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
|
||||
Optional<bool> EnableWavefrontSize32) {
|
||||
return getVGPRAllocGranule(STI, EnableWavefrontSize32);
|
||||
|
||||
bool IsWave32 = EnableWavefrontSize32 ?
|
||||
*EnableWavefrontSize32 :
|
||||
STI->getFeatureBits().test(FeatureWavefrontSize32);
|
||||
|
||||
return IsWave32 ? 8 : 4;
|
||||
}
|
||||
|
||||
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
|
||||
@ -732,13 +741,16 @@ static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) {
|
||||
return ID_SYMBOLIC_FIRST_GFX9_;
|
||||
else if (isGFX9(STI))
|
||||
return ID_SYMBOLIC_FIRST_GFX10_;
|
||||
else if (isGFX10(STI) && !isGFX10_BEncoding(STI))
|
||||
return ID_SYMBOLIC_FIRST_GFX1030_;
|
||||
else
|
||||
return ID_SYMBOLIC_LAST_;
|
||||
}
|
||||
|
||||
bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) {
|
||||
return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
|
||||
IdSymbolic[Id];
|
||||
return
|
||||
ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
|
||||
IdSymbolic[Id] && (Id != ID_XNACK_MASK || !AMDGPU::isGFX10_BEncoding(STI));
|
||||
}
|
||||
|
||||
bool isValidHwreg(int64_t Id) {
|
||||
@ -976,6 +988,14 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI) {
|
||||
return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
|
||||
}
|
||||
|
||||
/// Returns true when the feature bits of \p STI include
/// AMDGPU::FeatureGFX10_BEncoding.
bool isGFX10_BEncoding(const MCSubtargetInfo &STI) {
|
||||
return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding];
|
||||
}
|
||||
|
||||
/// Returns true when the feature bits of \p STI include
/// AMDGPU::FeatureGFX10_3Insts.
bool hasGFX10_3Insts(const MCSubtargetInfo &STI) {
|
||||
return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts];
|
||||
}
|
||||
|
||||
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
|
||||
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
|
||||
const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
|
||||
|
@ -558,6 +558,9 @@ bool isCI(const MCSubtargetInfo &STI);
|
||||
bool isVI(const MCSubtargetInfo &STI);
|
||||
bool isGFX9(const MCSubtargetInfo &STI);
|
||||
bool isGFX10(const MCSubtargetInfo &STI);
|
||||
bool isGCN3Encoding(const MCSubtargetInfo &STI);
|
||||
bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
|
||||
bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
|
||||
|
||||
/// Is Reg - scalar register
|
||||
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
|
||||
|
@ -463,6 +463,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
|
||||
let SubtargetPredicate = HasMadMacF32Insts in
|
||||
def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
|
||||
|
||||
let isCommutable = 1 in {
|
||||
@ -489,12 +490,14 @@ defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
|
||||
defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
|
||||
|
||||
let mayRaiseFPException = 0 in {
|
||||
let SubtargetPredicate = HasMadMacF32Insts in {
|
||||
let Constraints = "$vdst = $src2", DisableEncoding="$src2",
|
||||
isConvertibleToThreeAddress = 1 in {
|
||||
defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
|
||||
}
|
||||
|
||||
def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>;
|
||||
} // End SubtargetPredicate = HasMadMacF32Insts
|
||||
}
|
||||
|
||||
// No patterns so that the scalar instructions are always selected.
|
||||
@ -553,6 +556,7 @@ defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfma
|
||||
|
||||
let isCommutable = 1 in {
|
||||
let SubtargetPredicate = isGFX6GFX7GFX10 in {
|
||||
let OtherPredicates = [HasMadMacF32Insts] in
|
||||
defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
|
||||
} // End SubtargetPredicate = isGFX6GFX7GFX10
|
||||
let SubtargetPredicate = isGFX6GFX7 in {
|
||||
@ -1278,6 +1282,7 @@ let SubtargetPredicate = isGFX6GFX7 in {
|
||||
defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>;
|
||||
defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>;
|
||||
defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>;
|
||||
let OtherPredicates = [HasMadMacF32Insts] in
|
||||
defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>;
|
||||
defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>;
|
||||
defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>;
|
||||
@ -1610,3 +1615,9 @@ let SubtargetPredicate = HasDot3Insts in {
|
||||
let SubtargetPredicate = HasPkFmacF16Inst in {
|
||||
defm V_PK_FMAC_F16 : VOP2_Real_e32_vi<0x3c>;
|
||||
} // End SubtargetPredicate = HasPkFmacF16Inst
|
||||
|
||||
let SubtargetPredicate = HasDot3Insts in {
|
||||
// NB: Opcode conflicts with V_DOT2C_F32_F16
|
||||
let DecoderNamespace = "GFX10_B" in
|
||||
defm V_DOT8C_I32_I4 : VOP2_Real_DOT_ACC_gfx10<0x02>;
|
||||
}
|
||||
|
@ -292,8 +292,13 @@ class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
|
||||
let isCommutable = 1 in {
|
||||
|
||||
let mayRaiseFPException = 0 in {
|
||||
let SubtargetPredicate = HasMadMacF32Insts in {
|
||||
def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
|
||||
def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
|
||||
} // End SubtargetPredicate = HasMadMacF32Insts
|
||||
|
||||
let SubtargetPredicate = HasNoMadMacF32Insts in
|
||||
def V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
|
||||
}
|
||||
|
||||
def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
|
||||
@ -1020,6 +1025,10 @@ defm V_TRIG_PREOP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x174>;
|
||||
defm V_DIV_SCALE_F32 : VOP3be_Real_gfx6_gfx7_gfx10<0x16d>;
|
||||
defm V_DIV_SCALE_F64 : VOP3be_Real_gfx6_gfx7_gfx10<0x16e>;
|
||||
|
||||
// NB: Same opcode as v_mad_legacy_f32
|
||||
let DecoderNamespace = "GFX10_B" in
|
||||
defm V_FMA_LEGACY_F32 : VOP3_Real_gfx10<0x140>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// GFX8, GFX9 (VI).
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -3147,7 +3147,7 @@ define i64 @v_udiv_i64_24bit(i64 %num, i64 %den) {
|
||||
; CGP-NEXT: v_rcp_f32_e32 v2, v1
|
||||
; CGP-NEXT: v_mul_f32_e32 v2, v0, v2
|
||||
; CGP-NEXT: v_trunc_f32_e32 v2, v2
|
||||
; CGP-NEXT: v_mad_f32 v0, -v2, v1, v0
|
||||
; CGP-NEXT: v_fma_f32 v0, -v2, v1, v0
|
||||
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
|
||||
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v1
|
||||
; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
@ -3444,9 +3444,9 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
|
||||
; CGP-NEXT: v_mul_f32_e32 v6, v2, v6
|
||||
; CGP-NEXT: v_trunc_f32_e32 v5, v5
|
||||
; CGP-NEXT: v_trunc_f32_e32 v6, v6
|
||||
; CGP-NEXT: v_mad_f32 v0, -v5, v3, v0
|
||||
; CGP-NEXT: v_fma_f32 v0, -v5, v3, v0
|
||||
; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
|
||||
; CGP-NEXT: v_mad_f32 v2, -v6, v4, v2
|
||||
; CGP-NEXT: v_fma_f32 v2, -v6, v4, v2
|
||||
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
|
||||
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v0|, v3
|
||||
; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
|
||||
|
@ -3116,7 +3116,7 @@ define i64 @v_urem_i64_24bit(i64 %num, i64 %den) {
|
||||
; CGP-NEXT: v_rcp_f32_e32 v4, v3
|
||||
; CGP-NEXT: v_mul_f32_e32 v4, v2, v4
|
||||
; CGP-NEXT: v_trunc_f32_e32 v4, v4
|
||||
; CGP-NEXT: v_mad_f32 v2, -v4, v3, v2
|
||||
; CGP-NEXT: v_fma_f32 v2, -v4, v3, v2
|
||||
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
|
||||
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, v3
|
||||
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5]
|
||||
@ -3411,9 +3411,9 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
|
||||
; CGP-NEXT: v_mul_f32_e32 v10, v7, v10
|
||||
; CGP-NEXT: v_trunc_f32_e32 v9, v9
|
||||
; CGP-NEXT: v_trunc_f32_e32 v10, v10
|
||||
; CGP-NEXT: v_mad_f32 v5, -v9, v6, v5
|
||||
; CGP-NEXT: v_fma_f32 v5, -v9, v6, v5
|
||||
; CGP-NEXT: v_cvt_u32_f32_e32 v9, v9
|
||||
; CGP-NEXT: v_mad_f32 v7, -v10, v8, v7
|
||||
; CGP-NEXT: v_fma_f32 v7, -v10, v8, v7
|
||||
; CGP-NEXT: v_cvt_u32_f32_e32 v10, v10
|
||||
; CGP-NEXT: v_cmp_ge_f32_e64 s[4:5], |v5|, v6
|
||||
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
|
||||
|
@ -51,6 +51,7 @@
|
||||
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1010 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1010 %s
|
||||
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1011 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1011 %s
|
||||
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1012 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1012 %s
|
||||
; RUN: llc -filetype=obj -march=amdgcn -mcpu=gfx1030 < %s | llvm-readobj -file-headers - | FileCheck --check-prefixes=ALL,ARCH-GCN,GFX1030 %s
|
||||
|
||||
; ARCH-R600: Arch: r600
|
||||
; ARCH-GCN: Arch: amdgcn
|
||||
@ -96,6 +97,7 @@
|
||||
; GFX1010: EF_AMDGPU_MACH_AMDGCN_GFX1010 (0x33)
|
||||
; GFX1011: EF_AMDGPU_MACH_AMDGCN_GFX1011 (0x34)
|
||||
; GFX1012: EF_AMDGPU_MACH_AMDGCN_GFX1012 (0x35)
|
||||
; GFX1030: EF_AMDGPU_MACH_AMDGCN_GFX1030 (0x36)
|
||||
; ALL: ]
|
||||
|
||||
define amdgpu_kernel void @elf_header() {
|
||||
|
@ -1,8 +1,8 @@
|
||||
; RUN: llc -march=amdgcn -mattr=+fast-fmaf -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
|
||||
; RUN: llc -march=amdgcn -mattr=-fast-fmaf -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
|
||||
; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
|
||||
; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FLUSH %s
|
||||
|
||||
; RUN: llc -march=amdgcn -mattr=+fast-fmaf -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
|
||||
; RUN: llc -march=amdgcn -mattr=-fast-fmaf -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s
|
||||
; RUN: llc -march=amdgcn -mattr=+fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-FASTFMA %s
|
||||
; RUN: llc -march=amdgcn -mattr=-fast-fmaf,+mad-mac-f32-insts -denormal-fp-math-f32=ieee -enable-unsafe-fp-math -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SLOWFMA %s
|
||||
|
||||
; FIXME: This should also fold when fma is actually fast if an FMA
|
||||
; exists in the original program.
|
||||
|
@ -17,6 +17,8 @@
|
||||
; FIXME: Should probably test this, but sometimes selecting fmac is painful to match.
|
||||
; XUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx906 -denormal-fp-math-f32=ieee -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM-STRICT,GCN-DENORM,GFX9-DENORM,GCN-DENORM-FASTFMA,GCN-DENORM-FASTFMA-STRICT,GFX906 %s
|
||||
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx1030 -mattr=-fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-FLUSH-STRICT,GCN-FLUSH-FMAC,GCN-FLUSH-FASTFMA,GCN-FLUSH-FASTFMA-STRICT %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -mcpu=gfx1030 -mattr=+fp32-denormals -fp-contract=on < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-DENORM,GCN-DENORM-FASTFMA-STRICT,GCN-DENORM-STRICT %s
|
||||
|
||||
; Test all permutations of: fp32 denormals, fast fp contract, fp contract enabled for fmuladd, fmaf fast/slow.
|
||||
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
|
||||
|
||||
|
@ -28,6 +28,7 @@
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX1010 %s
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1011 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX1011 %s
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1012 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX1012 %s
|
||||
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -mattr=-code-object-v3 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX1030 %s
|
||||
|
||||
; HSA: .hsa_code_object_version 2,1
|
||||
; HSA-SI600: .hsa_code_object_isa 6,0,0,"AMD","AMDGPU"
|
||||
@ -50,3 +51,4 @@
|
||||
; HSA-GFX1010: .hsa_code_object_isa 10,1,0,"AMD","AMDGPU"
|
||||
; HSA-GFX1011: .hsa_code_object_isa 10,1,1,"AMD","AMDGPU"
|
||||
; HSA-GFX1012: .hsa_code_object_isa 10,1,2,"AMD","AMDGPU"
|
||||
; HSA-GFX1030: .hsa_code_object_isa 10,3,0,"AMD","AMDGPU"
|
||||
|
@ -5,6 +5,7 @@
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
|
||||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
|
||||
|
||||
define amdgpu_kernel void @idot8_acc32(<8 x i4> addrspace(1)* %src1,
|
||||
; GFX7-LABEL: idot8_acc32:
|
||||
|
37
test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
Normal file
37
test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll
Normal file
@ -0,0 +1,37 @@
|
||||
; RUN: llc < %s -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s -check-prefix=GCN
|
||||
|
||||
declare i32 @llvm.amdgcn.buffer.atomic.csub(i32, <4 x i32>, i32, i32, i1)
|
||||
declare i32 @llvm.amdgcn.global.atomic.csub(i32 addrspace(1)*, i32)
|
||||
|
||||
; GCN-LABEL: {{^}}buffer_atomic_csub:
|
||||
; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen glc
|
||||
define amdgpu_ps void @buffer_atomic_csub(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
|
||||
main_body:
|
||||
%ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}buffer_atomic_csub_off4_slc:
|
||||
; GCN: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 glc slc
|
||||
define amdgpu_ps void @buffer_atomic_csub_off4_slc(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) {
|
||||
main_body:
|
||||
%ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}global_atomic_csub:
|
||||
; GCN: global_atomic_csub v{{[0-9]+}}, v[{{[0-9:]+}}], v{{[0-9]+}}, off glc
|
||||
define amdgpu_kernel void @global_atomic_csub(i32 addrspace(1)* %ptr, i32 %data) {
|
||||
main_body:
|
||||
%ret = call i32 @llvm.amdgcn.global.atomic.csub(i32 addrspace(1)* %ptr, i32 %data)
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}global_atomic_csub_off4:
|
||||
; GCN: global_atomic_csub v{{[0-9]+}}, v[{{[0-9:]+}}], v{{[0-9]+}}, off offset:4 glc
|
||||
define amdgpu_kernel void @global_atomic_csub_off4(i32 addrspace(1)* %ptr, i32 %data) {
|
||||
main_body:
|
||||
%p = getelementptr i32, i32 addrspace(1)* %ptr, i64 1
|
||||
%ret = call i32 @llvm.amdgcn.global.atomic.csub(i32 addrspace(1)* %p, i32 %data)
|
||||
ret void
|
||||
}
|
253
test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
Normal file
253
test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
Normal file
@ -0,0 +1,253 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -show-mc-encoding < %s | FileCheck -check-prefixes=GCN,GFX10 %s
|
||||
|
||||
; Baseline 1D load: dmask 15 (0xf), both trailing control operands zero.
; Expects a four-register result in v[0:3] and no tfe/lwe/glc/slc modifiers.
; GCN-LABEL: {{^}}load_1d:
|
||||
; GFX10: image_msaa_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ;
|
||||
define amdgpu_ps <4 x float> @load_1d(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
main_body:
|
||||
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; 1D load with the second-to-last operand set to 1, which the check expects to
; print as tfe. The struct return adds an i32 status value, so the destination
; grows to v[0:4]; the status is stored to %out to keep it live.
; GCN-LABEL: {{^}}load_1d_tfe:
|
||||
; GFX10: image_msaa_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe ;
|
||||
define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; 1D load with the second-to-last operand set to 2, which the check expects to
; print as lwe. As in the tfe case the result occupies v[0:4] and the extra
; i32 is stored to %out.
; GCN-LABEL: {{^}}load_1d_lwe:
|
||||
; GFX10: image_msaa_load v[0:4], v{{[0-9]+}}, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ;
|
||||
define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
|
||||
main_body:
|
||||
%v = call {<4 x float>, i32} @llvm.amdgcn.image.msaa.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; Baseline 2D load: two coordinates (%s, %t) are expected in v[0:1] and the
; instruction should use dim:SQ_RSRC_IMG_2D.
; GCN-LABEL: {{^}}load_2d:
|
||||
; GFX10: image_msaa_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ;
|
||||
define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
|
||||
main_body:
|
||||
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2d.v4f32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; 2D load with tfe requested (second-to-last operand 1): five result registers
; v[0:4], with the extra i32 stored to %out.
; GCN-LABEL: {{^}}load_2d_tfe:
|
||||
; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ;
|
||||
define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; Baseline 3D load: three coordinates (%s, %t, %r) are expected in v[0:2] and
; the instruction should use dim:SQ_RSRC_IMG_3D.
; GCN-LABEL: {{^}}load_3d:
|
||||
; GFX10: image_msaa_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm ;
|
||||
define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
|
||||
main_body:
|
||||
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.3d.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; 3D load with the second-to-last operand set to 3; the check expects both
; tfe and lwe on the instruction and a five-register result v[0:4].
; GCN-LABEL: {{^}}load_3d_tfe_lwe:
|
||||
; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ;
|
||||
define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.3d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; Baseline 1D-array load: coordinate plus slice index are expected in v[0:1]
; with dim:SQ_RSRC_IMG_1D_ARRAY.
; GCN-LABEL: {{^}}load_1darray:
|
||||
; GFX10: image_msaa_load v[0:3], v[0:1], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm ;
|
||||
define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
|
||||
main_body:
|
||||
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.1darray.v4f32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; 1D-array load with tfe requested (second-to-last operand 1): five result
; registers v[0:4], with the extra i32 stored to %out.
; GCN-LABEL: {{^}}load_1darray_tfe:
|
||||
; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ;
|
||||
define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1darray.v4f32i32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; Baseline 2D-array load: two coordinates plus slice index are expected in
; v[0:2] with dim:SQ_RSRC_IMG_2D_ARRAY.
; GCN-LABEL: {{^}}load_2darray:
|
||||
; GFX10: image_msaa_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm ;
|
||||
define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
|
||||
main_body:
|
||||
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darray.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; 2D-array load with lwe requested (second-to-last operand 2): five result
; registers v[0:4], with the extra i32 stored to %out.
; GCN-LABEL: {{^}}load_2darray_lwe:
|
||||
; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ;
|
||||
define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; Baseline 2D-MSAA load: two coordinates plus fragment id are expected in
; v[0:2] with dim:SQ_RSRC_IMG_2D_MSAA.
; GCN-LABEL: {{^}}load_2dmsaa:
|
||||
; GFX10: image_msaa_load v[0:3], v[0:2], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm ;
|
||||
define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
|
||||
main_body:
|
||||
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; 2D-MSAA load with the second-to-last operand set to 3; the check expects
; both tfe and lwe on the instruction and a five-register result v[0:4].
; GCN-LABEL: {{^}}load_2dmsaa_both:
|
||||
; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ;
|
||||
define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; Baseline 2D-MSAA-array load: two coordinates, slice index and fragment id are
; expected in v[0:3] with dim:SQ_RSRC_IMG_2D_MSAA_ARRAY.
; GCN-LABEL: {{^}}load_2darraymsaa:
|
||||
; GFX10: image_msaa_load v[0:3], v[0:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ;
|
||||
define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
|
||||
main_body:
|
||||
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; 2D-MSAA-array load with tfe requested (second-to-last operand 1): five result
; registers v[0:4], with the extra i32 stored to %out.
; GCN-LABEL: {{^}}load_2darraymsaa_tfe:
|
||||
; GFX10: image_msaa_load v[0:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ;
|
||||
define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; tfe load with a <4 x float> return type but dmask 7 (three bits set). The
; check expects the destination to shrink to four registers v[0:3] --
; presumably three data dwords plus the tfe status dword.
; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask3:
|
||||
; GFX10: image_msaa_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ;
|
||||
define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v4f32i32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; tfe load with a <4 x float> return type but dmask 6 (two bits set). The
; check expects three destination registers v[0:2] -- presumably two data
; dwords plus the tfe status dword.
; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask2:
|
||||
; GFX10: image_msaa_load v[0:2], v{{[0-9]+}}, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe ;
|
||||
define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v4f32i32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; tfe load with a <4 x float> return type but dmask 8 (a single bit set). The
; check expects two destination registers v[0:1] -- presumably one data dword
; plus the tfe status dword.
; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask1:
|
||||
; GFX10: image_msaa_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ;
|
||||
define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v4f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; Same single-bit dmask (8) with tfe, but using the <2 x float> overload of the
; intrinsic. The expected instruction is identical to the V4 variant: two
; destination registers v[0:1].
; GCN-LABEL: {{^}}load_1d_tfe_V2_dmask1:
|
||||
; GFX10: image_msaa_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ;
|
||||
define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
|
||||
main_body:
|
||||
%v = call {<2 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v2f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.vec = extractvalue {<2 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<2 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <2 x float> %v.vec
|
||||
}
|
||||
|
||||
; Scalar float overload with dmask 8: the check expects a single destination
; register v0 and no tfe/lwe.
; GCN-LABEL: {{^}}load_1d_V1:
|
||||
; GFX10: image_msaa_load v0, v0, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm ;
|
||||
define amdgpu_ps float @load_1d_V1(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
main_body:
|
||||
%v = call float @llvm.amdgcn.image.msaa.load.1d.f32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
ret float %v
|
||||
}
|
||||
|
||||
; <2 x float> overload with dmask 9 (two non-adjacent bits): the check expects
; a two-register destination v[0:1].
; GCN-LABEL: {{^}}load_1d_V2:
|
||||
; GFX10: image_msaa_load v[0:1], v0, s[0:7] dmask:0x9 dim:SQ_RSRC_IMG_1D unorm ;
|
||||
define amdgpu_ps <2 x float> @load_1d_V2(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
main_body:
|
||||
%v = call <2 x float> @llvm.amdgcn.image.msaa.load.1d.v2f32.i32(i32 9, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
ret <2 x float> %v
|
||||
}
|
||||
|
||||
; Last operand set to 1: the check expects the glc modifier on the instruction.
; GCN-LABEL: {{^}}load_1d_glc:
|
||||
; GFX10: image_msaa_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc ;
|
||||
define amdgpu_ps <4 x float> @load_1d_glc(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
main_body:
|
||||
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 1)
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; Last operand set to 2: the check expects the slc modifier on the instruction.
; GCN-LABEL: {{^}}load_1d_slc:
|
||||
; GFX10: image_msaa_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm slc ;
|
||||
define amdgpu_ps <4 x float> @load_1d_slc(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
main_body:
|
||||
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; Last operand set to 3: the check expects both glc and slc on the instruction.
; GCN-LABEL: {{^}}load_1d_glc_slc:
|
||||
; GFX10: image_msaa_load v[0:3], v0, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc slc ;
|
||||
define amdgpu_ps <4 x float> @load_1d_glc_slc(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
main_body:
|
||||
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.1d.v4f32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 0, i32 3)
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; Declarations of the image.msaa.load overloads exercised above. As used by
; the calls in this file the operand order is: dmask, coordinates, resource
; descriptor, tfe/lwe control, cache policy. Overloads returning a struct carry
; an additional i32 alongside the texel data.
; NOTE(review): the {float,i32} 1d overload, the float 2d overload and
; attribute group #0 do not appear to be referenced by any function in this
; test -- confirm before removing.
declare <4 x float> @llvm.amdgcn.image.msaa.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare {float,i32} @llvm.amdgcn.image.msaa.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare {<2 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare <4 x float> @llvm.amdgcn.image.msaa.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare <4 x float> @llvm.amdgcn.image.msaa.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.3d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare <4 x float> @llvm.amdgcn.image.msaa.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.1darray.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare <4 x float> @llvm.amdgcn.image.msaa.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darray.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
|
||||
|
||||
declare float @llvm.amdgcn.image.msaa.load.1d.f32.i32(i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare float @llvm.amdgcn.image.msaa.load.2d.f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
|
||||
declare <2 x float> @llvm.amdgcn.image.msaa.load.1d.v2f32.i32(i32, i32, <8 x i32>, i32, i32) #1
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readonly }
|
@ -1,6 +1,7 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SIVI -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=SIVI -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
; RUN: not --crash llc -march=amdgcn -mcpu=gfx1030 -mattr=-flat-for-global -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX1030-ERR %s
|
||||
|
||||
declare i64 @llvm.amdgcn.s.memtime() #0
|
||||
|
||||
@ -12,6 +13,7 @@ declare i64 @llvm.amdgcn.s.memtime() #0
|
||||
; SIVI-NOT: lgkmcnt
|
||||
; GCN: s_memtime s{{\[[0-9]+:[0-9]+\]}}
|
||||
; GCN: {{buffer|global}}_store_dwordx2
|
||||
; GFX1030-ERR: ERROR
|
||||
define amdgpu_kernel void @test_s_memtime(i64 addrspace(1)* %out) #0 {
|
||||
%cycle0 = call i64 @llvm.amdgcn.s.memtime()
|
||||
store volatile i64 %cycle0, i64 addrspace(1)* %out
|
||||
|
@ -1,6 +1,7 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX906
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
|
||||
|
||||
declare i32 @llvm.amdgcn.sdot4(i32 %a, i32 %b, i32 %c, i1 %clamp)
|
||||
|
||||
|
@ -1,14 +1,15 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX906
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX908
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10,GFX1011
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10,GFX1011
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10
|
||||
|
||||
declare i32 @llvm.amdgcn.sdot8(i32 %a, i32 %b, i32 %c, i1 %clamp)
|
||||
|
||||
; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_clamp
|
||||
; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
|
||||
; GFX908: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
|
||||
; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
|
||||
; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
|
||||
; GFX908: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
|
||||
; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
|
||||
define amdgpu_kernel void @test_llvm_amdgcn_sdot8_clamp(
|
||||
i32 addrspace(1)* %r,
|
||||
<8 x i4> addrspace(1)* %a,
|
||||
@ -26,9 +27,9 @@ entry:
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_llvm_amdgcn_sdot8_no_clamp
|
||||
; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||
; GFX908: v_dot8c_i32_i4_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||
; GFX1011: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||
; GFX906: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||
; GFX908: v_dot8c_i32_i4_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||
; GFX10: v_dot8_i32_i4 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}}
|
||||
define amdgpu_kernel void @test_llvm_amdgcn_sdot8_no_clamp(
|
||||
i32 addrspace(1)* %r,
|
||||
<8 x i4> addrspace(1)* %a,
|
||||
|
388
test/CodeGen/AMDGPU/loop-prefetch.ll
Normal file
388
test/CodeGen/AMDGPU/loop-prefetch.ll
Normal file
@ -0,0 +1,388 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs -asm-verbose=0 < %s | FileCheck --check-prefixes=GCN,GFX10,GFX10-ASM %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s -filetype=obj | llvm-objdump -d --arch-name=amdgcn --mcpu=gfx1030 - | FileCheck --check-prefixes=GCN,GFX10,GFX10-DIS %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GFX8 %s
|
||||
|
||||
; The tonga run must not contain prefetch hints or 64-byte loop alignment
; anywhere in its output. The alignment directive is printed as .p2align, so
; that is the spelling the negative check has to look for.
; GFX8-NOT: s_inst_prefetch
|
||||
; GFX8-NOT: .p2align 6
|
||||
|
||||
; Smallest case: a 1024-iteration loop whose body is a single s_sleep. The
; checks expect the loop label directly after the s_movk_i32 and s_endpgm
; directly after the back-branch, i.e. no alignment padding and no
; s_inst_prefetch around the loop.
; GCN-LABEL: test_loop_64
|
||||
; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400
|
||||
; GFX10-DIS-NEXT: {{^$}}
|
||||
; GFX10-ASM-NEXT: [[L1:BB[0-9_]+]]:
|
||||
; GFX10-DIS-NEXT: <[[L1:BB[0-9_]+]]>:
|
||||
; GFX10: s_sleep 0
|
||||
; GFX10: s_cbranch_scc0 [[L1]]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @test_loop_64(i32 addrspace(1)* nocapture %arg) {
|
||||
bb:
|
||||
br label %bb2
|
||||
|
||||
bb1: ; preds = %bb2
|
||||
ret void
|
||||
|
||||
bb2: ; preds = %bb2, %bb
|
||||
%tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
|
||||
%tmp2 = add nuw nsw i32 %tmp1, 1
|
||||
%tmp3 = icmp eq i32 %tmp2, 1024
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
br i1 %tmp3, label %bb1, label %bb2
|
||||
}
|
||||
|
||||
; Loop body padded with 16 s_sleep calls. The checks expect the loop to be
; aligned (.p2align 6 in the assembly run, s_nop padding in the disassembly
; run) but no s_inst_prefetch before or after it.
; GCN-LABEL: test_loop_128
|
||||
; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400
|
||||
; GFX10-ASM-NEXT: .p2align 6
|
||||
; GFX10-DIS-NEXT: s_nop 0
|
||||
; GFX10-NOT: s_inst_prefetch
|
||||
; GFX10-ASM: [[L1:BB[0-9_]+]]:
|
||||
; GFX10-DIS: <[[L1:BB[0-9_]+]]>:
|
||||
; GFX10: s_sleep 0
|
||||
; GFX10: s_cbranch_scc0 [[L1]]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @test_loop_128(i32 addrspace(1)* nocapture %arg) {
|
||||
bb:
|
||||
br label %bb2
|
||||
|
||||
bb1: ; preds = %bb2
|
||||
ret void
|
||||
|
||||
bb2: ; preds = %bb2, %bb
|
||||
%tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
|
||||
%tmp2 = add nuw nsw i32 %tmp1, 1
|
||||
%tmp3 = icmp eq i32 %tmp2, 1024
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
br i1 %tmp3, label %bb1, label %bb2
|
||||
}
|
||||
|
||||
; Loop body padded with 34 s_sleep calls. The checks expect s_inst_prefetch 0x1
; right after the s_movk_i32, then the aligned loop, and s_inst_prefetch 0x2
; immediately after the back-branch, before s_endpgm.
; GCN-LABEL: test_loop_192
|
||||
; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400
|
||||
; GFX10-NEXT: s_inst_prefetch 0x1
|
||||
; GFX10-ASM-NEXT: .p2align 6
|
||||
; GFX10-DIS-NEXT: s_nop 0
|
||||
; GFX10-NOT: s_inst_prefetch
|
||||
; GFX10-ASM: [[L1:BB[0-9_]+]]:
|
||||
; GFX10-DIS: <[[L1:BB[0-9_]+]]>:
|
||||
; GFX10: s_sleep 0
|
||||
; GFX10: s_cbranch_scc0 [[L1]]
|
||||
; GFX10-NEXT: s_inst_prefetch 0x2
|
||||
; GFX10-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @test_loop_192(i32 addrspace(1)* nocapture %arg) {
|
||||
bb:
|
||||
br label %bb2
|
||||
|
||||
bb1: ; preds = %bb2
|
||||
ret void
|
||||
|
||||
bb2: ; preds = %bb2, %bb
|
||||
%tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
|
||||
%tmp2 = add nuw nsw i32 %tmp1, 1
|
||||
%tmp3 = icmp eq i32 %tmp2, 1024
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
br i1 %tmp3, label %bb1, label %bb2
|
||||
}
|
||||
|
||||
; Loop body padded with 50 s_sleep calls. The expectations match test_loop_64:
; the loop label directly follows the s_movk_i32 and s_endpgm directly follows
; the back-branch, so neither alignment nor s_inst_prefetch is expected --
; presumably because the loop is too large for the prefetch window (confirm
; against the backend's loop-alignment logic).
; GCN-LABEL: test_loop_256
|
||||
; GFX10: s_movk_i32 s{{[0-9]+}}, 0x400
|
||||
; GFX10-DIS-NEXT: {{^$}}
|
||||
; GFX10-ASM-NEXT: [[L1:BB[0-9_]+]]:
|
||||
; GFX10-DIS-NEXT: <[[L1:BB[0-9_]+]]>:
|
||||
; GFX10: s_sleep 0
|
||||
; GFX10: s_cbranch_scc0 [[L1]]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @test_loop_256(i32 addrspace(1)* nocapture %arg) {
|
||||
bb:
|
||||
br label %bb2
|
||||
|
||||
bb1: ; preds = %bb2
|
||||
ret void
|
||||
|
||||
bb2: ; preds = %bb2, %bb
|
||||
%tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb2 ]
|
||||
%tmp2 = add nuw nsw i32 %tmp1, 1
|
||||
%tmp3 = icmp eq i32 %tmp2, 1024
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
br i1 %tmp3, label %bb1, label %bb2
|
||||
}
|
||||
|
||||
; Two-level loop nest whose inner loop contains 34 s_sleep calls. The checks
; expect a single s_inst_prefetch 0x1 ahead of the outer loop label (L1), both
; loop headers aligned, no further prefetch instructions inside the nest, and
; s_inst_prefetch 0x2 right after the outer back-branch.
; GCN-LABEL: test_loop_prefetch_inner_outer
|
||||
; GFX10: s_inst_prefetch 0x1
|
||||
; GFX10-ASM-NEXT: .p2align 6
|
||||
; GFX10-DIS-NEXT: s_nop 0
|
||||
; GFX10-NOT: s_inst_prefetch
|
||||
; GFX10-ASM: [[L1:BB[0-9_]+]]:
|
||||
; GFX10-DIS: <[[L1:BB[0-9_]+]]>:
|
||||
; GFX10-NOT: s_inst_prefetch
|
||||
; GFX10-ASM: .p2align 6
|
||||
; GFX10-DIS: s_nop 0
|
||||
; GFX10-NOT: s_inst_prefetch
|
||||
; GFX10-ASM: [[L2:BB[0-9_]+]]:
|
||||
; GFX10-DIS: <[[L2:BB[0-9_]+]]>:
|
||||
; GFX10-NOT: s_inst_prefetch
|
||||
; GFX10: s_sleep 0
|
||||
; GFX10: s_cbranch_scc{{[01]}} [[L2]]
|
||||
; GFX10-NOT: s_inst_prefetch
|
||||
; GFX10: s_cbranch_scc{{[01]}} [[L1]]
|
||||
; GFX10-NEXT: s_inst_prefetch 0x2
|
||||
; GFX10-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @test_loop_prefetch_inner_outer(i32 addrspace(1)* nocapture %arg) {
|
||||
bb:
|
||||
br label %bb2
|
||||
|
||||
bb1:
|
||||
ret void
|
||||
|
||||
bb2:
|
||||
%tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb4 ]
|
||||
%tmp2 = add nuw nsw i32 %tmp1, 1
|
||||
%tmp3 = icmp eq i32 %tmp2, 1024
|
||||
br label %bb3
|
||||
|
||||
bb3:
|
||||
%tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb3 ]
|
||||
%tmp5 = add nuw nsw i32 %tmp4, 1
|
||||
%tmp6 = icmp eq i32 %tmp5, 1024
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
br i1 %tmp6, label %bb4, label %bb3
|
||||
|
||||
bb4:
|
||||
br i1 %tmp3, label %bb1, label %bb2
|
||||
}
|
||||
|
||||
; Three-level loop nest: the innermost loop has 34 s_sleep calls and the
; outermost loop has 16 more after the middle loop exits. The checks expect the
; outermost loop (L0) to get neither alignment nor prefetch; the prefetch mode
; is switched to 0x1 inside L0 just before the middle loop (L1) and back to 0x2
; right after L1's back-branch, before L0's own back-branch.
; GCN-LABEL: test_loop_prefetch_inner_outer_noouter
|
||||
; GFX10-NOT: .p2align 6
|
||||
; GFX10-NOT: s_nop
|
||||
; GFX10-NOT: s_inst_prefetch
|
||||
; GFX10-ASM: [[L0:BB[0-9_]+]]:
|
||||
; GFX10-DIS: <[[L0:BB[0-9_]+]]>:
|
||||
; GFX10: s_inst_prefetch 0x1
|
||||
; GFX10-ASM-NEXT: .p2align 6
|
||||
; GFX10-DIS-NEXT: s_nop 0
|
||||
; GFX10-NOT: s_inst_prefetch
|
||||
; GFX10-ASM: [[L1:BB[0-9_]+]]:
|
||||
; GFX10-DIS: <[[L1:BB[0-9_]+]]>:
|
||||
; GFX10-NOT: s_inst_prefetch
|
||||
; GFX10-ASM: .p2align 6
|
||||
; GFX10-DIS: s_nop 0
|
||||
; GFX10-NOT: s_inst_prefetch
|
||||
; GFX10-ASM: [[L2:BB[0-9_]+]]:
|
||||
; GFX10-DIS: <[[L2:BB[0-9_]+]]>:
|
||||
; GFX10-NOT: s_inst_prefetch
|
||||
; GFX10: s_sleep 0
|
||||
; GFX10: s_cbranch_scc{{[01]}} [[L2]]
|
||||
; GFX10-NOT: s_inst_prefetch
|
||||
; GFX10: s_cbranch_scc{{[01]}} [[L1]]
|
||||
; GFX10-NEXT: s_inst_prefetch 0x2
|
||||
; GFX10: s_cbranch_scc{{[01]}} [[L0]]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
define amdgpu_kernel void @test_loop_prefetch_inner_outer_noouter(i32 addrspace(1)* nocapture %arg) {
|
||||
bb:
|
||||
br label %bb2
|
||||
|
||||
bb1:
|
||||
ret void
|
||||
|
||||
bb2:
|
||||
%tmp1 = phi i32 [ 0, %bb ], [ %tmp2, %bb6 ]
|
||||
%tmp2 = add nuw nsw i32 %tmp1, 1
|
||||
%tmp3 = icmp eq i32 %tmp2, 1024
|
||||
br label %bb3
|
||||
|
||||
bb3:
|
||||
%tmp4 = phi i32 [ 0, %bb2 ], [ %tmp5, %bb5 ]
|
||||
%tmp5 = add nuw nsw i32 %tmp4, 1
|
||||
%tmp6 = icmp eq i32 %tmp5, 1024
|
||||
br label %bb4
|
||||
|
||||
bb4:
|
||||
%tmp7 = phi i32 [ 0, %bb3 ], [ %tmp8, %bb4 ]
|
||||
%tmp8 = add nuw nsw i32 %tmp7, 1
|
||||
%tmp9 = icmp eq i32 %tmp8, 1024
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
br i1 %tmp9, label %bb5, label %bb4
|
||||
|
||||
bb5:
|
||||
br i1 %tmp6, label %bb6, label %bb3
|
||||
|
||||
bb6:
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
tail call void @llvm.amdgcn.s.sleep(i32 0)
|
||||
br i1 %tmp3, label %bb1, label %bb2
|
||||
}
|
||||
|
||||
; s_sleep intrinsic; the tests above call it repeatedly to pad loop bodies to
; the sizes they want to exercise.
declare void @llvm.amdgcn.s.sleep(i32)
|
@ -1,4 +1,4 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mattr=+mad-mac-f32-insts -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
|
||||
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
|
||||
|
||||
; FIXME: None of these trigger madmk emission anymore. It is still
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s
|
||||
|
||||
; CHECK-LABEL: {{^}}fold_sgpr:
|
||||
; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s
|
||||
|
@ -1,6 +1,7 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MEMTIME -check-prefix=SIVI -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=MEMTIME -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GETREG -check-prefix=GCN %s
|
||||
|
||||
declare i64 @llvm.readcyclecounter() #0
|
||||
|
||||
@ -13,6 +14,14 @@ declare i64 @llvm.readcyclecounter() #0
|
||||
; MEMTIME: s_memtime s{{\[[0-9]+:[0-9]+\]}}
|
||||
; MEMTIME: store_dwordx2
|
||||
|
||||
; GETREG-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0
|
||||
; GETREG-DAG: s_getreg_b32 [[CNT1:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES, 0, 20)
|
||||
; GETREG-DAG: v_mov_b32_e32 v[[VCNT1:[0-9]+]], [[CNT1]]
|
||||
; GETREG: global_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[VCNT1]]:[[ZERO]]], off
|
||||
; GETREG: s_getreg_b32 [[CNT2:s[0-9]+]], hwreg(HW_REG_SHADER_CYCLES, 0, 20)
|
||||
; GETREG: v_mov_b32_e32 v[[VCNT2:[0-9]+]], [[CNT2]]
|
||||
; GETREG: global_store_dwordx2 v[{{[0-9:]+}}], v{{\[}}[[VCNT2]]:[[ZERO]]], off
|
||||
|
||||
define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
|
||||
%cycle0 = call i64 @llvm.readcyclecounter()
|
||||
store volatile i64 %cycle0, i64 addrspace(1)* %out
|
||||
@ -27,6 +36,7 @@ define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
|
||||
; GCN-LABEL: {{^}}test_readcyclecounter_smem:
|
||||
; MEMTIME-DAG: s_memtime
|
||||
; GCN-DAG: s_load_dword
|
||||
; GETREG-DAG: s_getreg_b32 s1, hwreg(HW_REG_SHADER_CYCLES, 0, 20)
|
||||
define amdgpu_cs i32 @test_readcyclecounter_smem(i64 addrspace(4)* inreg %in) #0 {
|
||||
%cycle0 = call i64 @llvm.readcyclecounter()
|
||||
%in.v = load i64, i64 addrspace(4)* %in
|
||||
|
@ -2,6 +2,7 @@
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -denormal-fp-math-f32=preserve-sign < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s
|
||||
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=fiji -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=ieee < %s | FileCheck -check-prefix=GCN -check-prefix=GFX1030 %s
|
||||
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
|
||||
@ -186,6 +187,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
|
||||
|
||||
; GCN-LABEL: {{^}}fdiv_test_denormals
|
||||
; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX1030: v_fmac_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
|
||||
define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
|
||||
bb:
|
||||
%tmp = load i8, i8 addrspace(1)* null, align 1
|
||||
|
@ -1,4 +1,4 @@
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mattr=+mad-mac-f32-insts -denormal-fp-math-f32=preserve-sign -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-FLUSH -check-prefix=GCN %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=VI-DENORM -check-prefix=GCN %s
|
||||
|
||||
|
139
test/MC/AMDGPU/gfx1030_err.s
Normal file
139
test/MC/AMDGPU/gfx1030_err.s
Normal file
@ -0,0 +1,139 @@
|
||||
// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1030 -show-encoding %s 2>&1 | FileCheck --check-prefix=GFX10 %s
|
||||
|
||||
v_dot8c_i32_i4 v5, v1, v2
|
||||
// GFX10: error:
|
||||
|
||||
v_dot8c_i32_i4 v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
|
||||
// GFX10: error:
|
||||
|
||||
v_dot8c_i32_i4 v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
|
||||
// GFX10: error:
|
||||
|
||||
v_dot8c_i32_i4 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
|
||||
// GFX10: error:
|
||||
|
||||
v_dot8c_i32_i4 v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
|
||||
// GFX10: error:
|
||||
|
||||
s_get_waveid_in_workgroup s0
|
||||
// GFX10: error:
|
||||
|
||||
s_memtime s[0:1]
|
||||
// GFX10: error:
|
||||
|
||||
s_getreg_b32 s2, hwreg(HW_REG_XNACK_MASK)
|
||||
// GFX10: error:
|
||||
|
||||
v_mac_f32 v0, v1, v2
|
||||
// GFX10: error:
|
||||
|
||||
v_mad_f32 v0, v1, v2, v3
|
||||
// GFX10: error:
|
||||
|
||||
v_madak_f32 v0, v1, v2, 1
|
||||
// GFX10: error:
|
||||
|
||||
v_madmk_f32 v0, v1, 1, v2
|
||||
// GFX10: error:
|
||||
|
||||
v_mad_legacy_f32 v0, v1, v2, v3
|
||||
// GFX10: error:
|
||||
|
||||
v_mac_legacy_f32 v0, v1, v2
|
||||
// GFX10: error:
|
||||
|
||||
ds_add_src2_u32 v1 offset:65535 gds
|
||||
// GFX10: error:
|
||||
|
||||
ds_add_src2_u32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_add_src2_f32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_sub_src2_u32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_rsub_src2_u32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_inc_src2_u32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_dec_src2_u32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_min_src2_i32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_max_src2_i32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_min_src2_u32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_max_src2_u32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_and_src2_b32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_or_src2_b32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_xor_src2_b32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_min_src2_f32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_max_src2_f32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_add_src2_u64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_sub_src2_u64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_rsub_src2_u64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_inc_src2_u64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_dec_src2_u64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_min_src2_i64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_max_src2_i64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_min_src2_u64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_max_src2_u64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_and_src2_b64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_or_src2_b64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_xor_src2_b64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_min_src2_f64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_max_src2_f64 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_write_src2_b32 v1 offset:65535
|
||||
// GFX10: error:
|
||||
|
||||
ds_write_src2_b64 v1 offset:65535
|
||||
// GFX10: error:
|
76
test/MC/AMDGPU/gfx1030_new.s
Normal file
76
test/MC/AMDGPU/gfx1030_new.s
Normal file
@ -0,0 +1,76 @@
|
||||
// RUN: llvm-mc -arch=amdgcn -mcpu=gfx1030 -show-encoding %s | FileCheck --check-prefix=GFX10 %s
|
||||
|
||||
global_load_dword_addtid v1, s[2:3] offset:16
|
||||
// GFX10: encoding: [0x10,0x80,0x58,0xdc,0x00,0x00,0x02,0x01]
|
||||
|
||||
global_load_dword_addtid v1, s[2:3] offset:16 glc slc dlc
|
||||
// GFX10: encoding: [0x10,0x90,0x5b,0xdc,0x00,0x00,0x02,0x01]
|
||||
|
||||
global_store_dword_addtid v1, s[2:3] offset:16 glc slc dlc
|
||||
// GFX10: encoding: [0x10,0x90,0x5f,0xdc,0x00,0x01,0x02,0x00]
|
||||
|
||||
global_store_dword v[254:255], v1, s[2:3] offset:16
|
||||
// GFX10: encoding: [0x10,0x80,0x70,0xdc,0xfe,0x01,0x02,0x00]
|
||||
|
||||
global_atomic_csub v2, v[0:1], v2, off offset:100 glc slc
|
||||
// GFX10: encoding: [0x64,0x80,0xd3,0xdc,0x00,0x02,0x7d,0x02]
|
||||
|
||||
global_atomic_csub v2, v[0:1], v2, off
|
||||
// GFX10: encoding: [0x00,0x80,0xd1,0xdc,0x00,0x02,0x7d,0x02]
|
||||
|
||||
global_atomic_csub v2, v[0:1], v2, s[2:3]
|
||||
// GFX10: encoding: [0x00,0x80,0xd1,0xdc,0x00,0x02,0x02,0x02]
|
||||
|
||||
global_atomic_csub v2, v[0:1], v2, s[2:3] offset:100 glc slc
|
||||
// GFX10: encoding: [0x64,0x80,0xd3,0xdc,0x00,0x02,0x02,0x02]
|
||||
|
||||
buffer_atomic_csub v5, off, s[8:11], s3
|
||||
// GFX10: encoding: [0x00,0x40,0xd0,0xe0,0x00,0x05,0x02,0x03]
|
||||
|
||||
buffer_atomic_csub v5, off, s[8:11], s3 offset:4095 glc
|
||||
// GFX10: encoding: [0xff,0x4f,0xd0,0xe0,0x00,0x05,0x02,0x03]
|
||||
|
||||
buffer_atomic_csub v5, off, s[8:11], -1 offset:4095 glc
|
||||
// GFX10: encoding: [0xff,0x4f,0xd0,0xe0,0x00,0x05,0x02,0xc1]
|
||||
|
||||
buffer_atomic_csub v5, v0, s[8:11], s3 offen offset:4095 glc
|
||||
// GFX10: encoding: [0xff,0x5f,0xd0,0xe0,0x00,0x05,0x02,0x03]
|
||||
|
||||
buffer_atomic_csub v5, v0, s[8:11], s3 idxen offset:4095 glc
|
||||
// GFX10: encoding: [0xff,0x6f,0xd0,0xe0,0x00,0x05,0x02,0x03]
|
||||
|
||||
buffer_atomic_csub v5, off, s[8:11], s3 glc slc
|
||||
// GFX10: encoding: [0x00,0x40,0xd0,0xe0,0x00,0x05,0x42,0x03]
|
||||
|
||||
s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES)
|
||||
// GFX10: encoding: [0x1d,0xf8,0x02,0xb9]
|
||||
|
||||
s_getreg_b32 s2, 29
|
||||
// GFX10: s_getreg_b32 s2, hwreg(HW_REG_SHADER_CYCLES, 0, 1) ; encoding: [0x1d,0x00,0x02,0xb9]
|
||||
|
||||
s_getreg_b32 s2, hwreg(22)
|
||||
// GFX10: s_getreg_b32 s2, hwreg(22) ; encoding: [0x16,0xf8,0x02,0xb9]
|
||||
|
||||
v_fma_legacy_f32 v0, v1, v2, v3
|
||||
// GFX10: encoding: [0x00,0x00,0x40,0xd5,0x01,0x05,0x0e,0x04]
|
||||
|
||||
v_fma_legacy_f32 v0, v1, |v2|, -v3
|
||||
// GFX10: encoding: [0x00,0x02,0x40,0xd5,0x01,0x05,0x0e,0x84]
|
||||
|
||||
v_fma_legacy_f32 v0, s1, 2.0, -v3
|
||||
// GFX10: encoding: [0x00,0x00,0x40,0xd5,0x01,0xe8,0x0d,0x84]
|
||||
|
||||
image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D
|
||||
// GFX10: encoding: [0x01,0x0f,0x00,0xf0,0x05,0x01,0x02,0x00]
|
||||
|
||||
image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D glc
|
||||
// GFX10: encoding: [0x01,0x2f,0x00,0xf0,0x05,0x01,0x02,0x00]
|
||||
|
||||
image_msaa_load v5, v[1:2], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_2D d16
|
||||
// GFX10: encoding: [0x09,0x01,0x00,0xf0,0x01,0x05,0x02,0x80]
|
||||
|
||||
image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D
|
||||
// GFX10: encoding: [0x01,0x0f,0x00,0xf0,0x05,0x01,0x02,0x00]
|
||||
|
||||
image_msaa_load v14, [v204,v11,v14,v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
|
||||
// GFX10: encoding: [0x3b,0x01,0x00,0xf0,0xcc,0x0e,0x0a,0x00,0x0b,0x0e,0x13,0x00]
|
67
test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt
Normal file
67
test/MC/Disassembler/AMDGPU/gfx1030_dasm_new.txt
Normal file
@ -0,0 +1,67 @@
|
||||
# RUN: llvm-mc -arch=amdgcn -mcpu=gfx1030 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX10 %s
|
||||
|
||||
# GFX10: global_load_dword_addtid v1, s[2:3] offset:16
|
||||
0x10,0x80,0x58,0xdc,0x00,0x00,0x02,0x01
|
||||
|
||||
# GFX10: global_load_dword_addtid v1, s[2:3] offset:16 glc slc dlc
|
||||
0x10,0x90,0x5b,0xdc,0x00,0x00,0x02,0x01
|
||||
|
||||
# GFX10: global_store_dword_addtid v1, s[2:3] offset:16 glc slc dlc
|
||||
0x10,0x90,0x5f,0xdc,0x00,0x01,0x02,0x00
|
||||
|
||||
# GFX10: global_store_dword v[254:255], v1, s[2:3] offset:16
|
||||
0x10,0x80,0x70,0xdc,0xfe,0x01,0x02,0x00
|
||||
|
||||
# GFX10: global_atomic_csub v2, v[0:1], v2, off offset:100 glc slc
|
||||
0x64,0x80,0xd3,0xdc,0x00,0x02,0x7d,0x02
|
||||
|
||||
# GFX10: global_atomic_csub v2, v[0:1], v2, off glc
|
||||
0x00,0x80,0xd1,0xdc,0x00,0x02,0x7d,0x02
|
||||
|
||||
# GFX10: global_atomic_csub v2, v[0:1], v2, s[2:3] glc
|
||||
0x00,0x80,0xd1,0xdc,0x00,0x02,0x02,0x02
|
||||
|
||||
# GFX10: global_atomic_csub v2, v[0:1], v2, s[2:3] offset:100 glc slc
|
||||
0x64,0x80,0xd3,0xdc,0x00,0x02,0x02,0x02
|
||||
|
||||
# GFX10: buffer_atomic_csub v5, off, s[8:11], s3
|
||||
0x00,0x40,0xd0,0xe0,0x00,0x05,0x02,0x03
|
||||
|
||||
# GFX10: buffer_atomic_csub v5, off, s[8:11], s3 offset:4095 glc
|
||||
0xff,0x4f,0xd0,0xe0,0x00,0x05,0x02,0x03
|
||||
|
||||
# GFX10: buffer_atomic_csub v5, off, s[8:11], -1 offset:4095 glc
|
||||
0xff,0x4f,0xd0,0xe0,0x00,0x05,0x02,0xc1
|
||||
|
||||
# GFX10: buffer_atomic_csub v5, v0, s[8:11], s3 offen offset:4095 glc
|
||||
0xff,0x5f,0xd0,0xe0,0x00,0x05,0x02,0x03
|
||||
|
||||
# GFX10: buffer_atomic_csub v5, v0, s[8:11], s3 idxen offset:4095 glc
|
||||
0xff,0x6f,0xd0,0xe0,0x00,0x05,0x02,0x03
|
||||
|
||||
# GFX10: buffer_atomic_csub v5, off, s[8:11], s3 glc slc
|
||||
0x00,0x40,0xd0,0xe0,0x00,0x05,0x42,0x03
|
||||
|
||||
# GFX10: v_fma_legacy_f32 v0, v1, v2, v3
|
||||
0x00,0x00,0x40,0xd5,0x01,0x05,0x0e,0x04
|
||||
|
||||
# GFX10: v_fma_legacy_f32 v0, v1, |v2|, -v3
|
||||
0x00,0x02,0x40,0xd5,0x01,0x05,0x0e,0x84
|
||||
|
||||
# GFX10: v_fma_legacy_f32 v0, s1, 2.0, -v3
|
||||
0x00,0x00,0x40,0xd5,0x01,0xe8,0x0d,0x84
|
||||
|
||||
# GFX10: image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D
|
||||
0x01,0x0f,0x00,0xf0,0x05,0x01,0x02,0x00
|
||||
|
||||
# GFX10: image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D glc
|
||||
0x01,0x2f,0x00,0xf0,0x05,0x01,0x02,0x00
|
||||
|
||||
# GFX10: image_msaa_load v5, v[1:2], s[8:15] dmask:0x1 dim:SQ_RSRC_IMG_2D d16
|
||||
0x09,0x01,0x00,0xf0,0x01,0x05,0x02,0x80
|
||||
|
||||
# GFX10: image_msaa_load v[1:4], v5, s[8:15] dmask:0xf dim:SQ_RSRC_IMG_1D
|
||||
0x01,0x0f,0x00,0xf0,0x05,0x01,0x02,0x00
|
||||
|
||||
# GFX10: image_msaa_load v14, [v204, v11, v14, v19], s[40:47] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
|
||||
0x3b,0x01,0x00,0xf0,0xcc,0x0e,0x0a,0x00,0x0b,0x0e,0x13,0x00
|
@ -1797,6 +1797,7 @@ static const EnumEntry<unsigned> ElfHeaderAMDGPUFlags[] = {
|
||||
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1010),
|
||||
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1011),
|
||||
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1012),
|
||||
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_MACH_AMDGCN_GFX1030),
|
||||
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_XNACK),
|
||||
LLVM_READOBJ_ENUM_ENT(ELF, EF_AMDGPU_SRAM_ECC)
|
||||
};
|
||||
|
Loading…
x
Reference in New Issue
Block a user