mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
Revert r347871 "Fix: Add support for TFE/LWE in image intrinsic"
Also revert fix r347876 One of the buildbots was reporting a failure in some relevant tests that I can't repro or explain at present, so reverting until I can isolate. llvm-svn: 347911
This commit is contained in:
parent
24aa32ed4a
commit
c4074ba685
@ -590,7 +590,7 @@ class AMDGPUDimSampleProfile<string opmod,
|
||||
AMDGPUDimProps dim,
|
||||
AMDGPUSampleVariant sample> : AMDGPUDimProfile<opmod, dim> {
|
||||
let IsSample = 1;
|
||||
let RetTypes = [llvm_any_ty];
|
||||
let RetTypes = [llvm_anyfloat_ty];
|
||||
let ExtraAddrArgs = sample.ExtraAddrArgs;
|
||||
let Gradients = sample.Gradients;
|
||||
let LodClampMip = sample.LodOrClamp;
|
||||
@ -683,11 +683,11 @@ defset list<AMDGPUImageDimIntrinsic> AMDGPUImageDimIntrinsics = {
|
||||
}
|
||||
|
||||
defm int_amdgcn_image_load
|
||||
: AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_any_ty], [], [IntrReadMem],
|
||||
: AMDGPUImageDimIntrinsicsAll<"LOAD", [llvm_anyfloat_ty], [], [IntrReadMem],
|
||||
[SDNPMemOperand]>,
|
||||
AMDGPUImageDMaskIntrinsic;
|
||||
defm int_amdgcn_image_load_mip
|
||||
: AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_any_ty], [],
|
||||
: AMDGPUImageDimIntrinsicsNoMsaa<"LOAD_MIP", [llvm_anyfloat_ty], [],
|
||||
[IntrReadMem], [SDNPMemOperand], 1>,
|
||||
AMDGPUImageDMaskIntrinsic;
|
||||
|
||||
|
@ -42,7 +42,6 @@ FunctionPass *createSIFoldOperandsPass();
|
||||
FunctionPass *createSIPeepholeSDWAPass();
|
||||
FunctionPass *createSILowerI1CopiesPass();
|
||||
FunctionPass *createSIFixupVectorISelPass();
|
||||
FunctionPass *createSIAddIMGInitPass();
|
||||
FunctionPass *createSIShrinkInstructionsPass();
|
||||
FunctionPass *createSILoadStoreOptimizerPass();
|
||||
FunctionPass *createSIWholeQuadModePass();
|
||||
@ -154,9 +153,6 @@ extern char &AMDGPUSimplifyLibCallsID;
|
||||
void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
|
||||
extern char &AMDGPUUseNativeCallsID;
|
||||
|
||||
void initializeSIAddIMGInitPass(PassRegistry &);
|
||||
extern char &SIAddIMGInitID;
|
||||
|
||||
void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
|
||||
extern char &AMDGPUPerfHintAnalysisID;
|
||||
|
||||
|
@ -367,16 +367,6 @@ def FeatureEnableDS128 : SubtargetFeature<"enable-ds128",
|
||||
"Use ds_{read|write}_b128"
|
||||
>;
|
||||
|
||||
// Sparse texture support requires that all result registers are zeroed when
|
||||
// PRTStrictNull is set to true. This feature is turned on for all architectures
|
||||
// but is enabled as a feature in case there are situations where PRTStrictNull
|
||||
// is disabled by the driver.
|
||||
def FeatureEnablePRTStrictNull : SubtargetFeature<"enable-prt-strict-null",
|
||||
"EnablePRTStrictNull",
|
||||
"true",
|
||||
"Enable zeroing of result registers for sparse texture fetches"
|
||||
>;
|
||||
|
||||
// Unless +-flat-for-global is specified, turn on FlatForGlobal for
|
||||
// all OS-es on VI and newer hardware to avoid assertion failures due
|
||||
// to missing ADDR64 variants of MUBUF instructions.
|
||||
|
@ -74,9 +74,6 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
|
||||
// We want to be able to turn these off, but making this a subtarget feature
|
||||
// for SI has the unhelpful behavior that it unsets everything else if you
|
||||
// disable it.
|
||||
//
|
||||
// Similarly we want enable-prt-strict-null to be on by default and not to
|
||||
// unset everything else if it is disabled
|
||||
|
||||
SmallString<256> FullFS("+promote-alloca,+dx10-clamp,+load-store-opt,");
|
||||
|
||||
@ -92,8 +89,6 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
|
||||
FullFS += "-fp32-denormals,";
|
||||
}
|
||||
|
||||
FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
|
||||
|
||||
FullFS += FS;
|
||||
|
||||
ParseSubtargetFeatures(GPU, FullFS);
|
||||
@ -180,7 +175,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
|
||||
EnableUnsafeDSOffsetFolding(false),
|
||||
EnableSIScheduler(false),
|
||||
EnableDS128(false),
|
||||
EnablePRTStrictNull(false),
|
||||
DumpCode(false),
|
||||
|
||||
FP64(false),
|
||||
|
@ -326,7 +326,6 @@ protected:
|
||||
bool EnableUnsafeDSOffsetFolding;
|
||||
bool EnableSIScheduler;
|
||||
bool EnableDS128;
|
||||
bool EnablePRTStrictNull;
|
||||
bool DumpCode;
|
||||
|
||||
// Subtarget statically properties set by tablegen
|
||||
@ -577,12 +576,6 @@ public:
|
||||
return getGeneration() < AMDGPUSubtarget::GFX9;
|
||||
}
|
||||
|
||||
/// \returns If target requires PRT Struct NULL support (zero result registers
|
||||
/// for sparse texture support).
|
||||
bool usePRTStrictNull() const {
|
||||
return EnablePRTStrictNull;
|
||||
}
|
||||
|
||||
bool hasAutoWaitcntBeforeBarrier() const {
|
||||
return AutoWaitcntBeforeBarrier;
|
||||
}
|
||||
|
@ -815,7 +815,6 @@ bool GCNPassConfig::addInstSelector() {
|
||||
addPass(&SIFixSGPRCopiesID);
|
||||
addPass(createSILowerI1CopiesPass());
|
||||
addPass(createSIFixupVectorISelPass());
|
||||
addPass(createSIAddIMGInitPass());
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -93,7 +93,6 @@ add_llvm_target(AMDGPUCodeGen
|
||||
R600OptimizeVectorRegisters.cpp
|
||||
R600Packetizer.cpp
|
||||
R600RegisterInfo.cpp
|
||||
SIAddIMGInit.cpp
|
||||
SIAnnotateControlFlow.cpp
|
||||
SIDebuggerInsertNops.cpp
|
||||
SIFixSGPRCopies.cpp
|
||||
|
@ -29,7 +29,6 @@ class MIMGBaseOpcode {
|
||||
bit Atomic = 0;
|
||||
bit AtomicX2 = 0; // (f)cmpswap
|
||||
bit Sampler = 0;
|
||||
bit Gather4 = 0;
|
||||
bits<8> NumExtraArgs = 0;
|
||||
bit Gradients = 0;
|
||||
bit Coordinates = 1;
|
||||
@ -44,7 +43,7 @@ def MIMGBaseOpcode : GenericEnum {
|
||||
def MIMGBaseOpcodesTable : GenericTable {
|
||||
let FilterClass = "MIMGBaseOpcode";
|
||||
let CppTypeName = "MIMGBaseOpcodeInfo";
|
||||
let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4",
|
||||
let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
|
||||
"NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip",
|
||||
"HasD16"];
|
||||
GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
|
||||
@ -180,8 +179,6 @@ multiclass MIMG_NoSampler <bits<7> op, string asm, bit has_d16, bit mip = 0,
|
||||
defm _V3 : MIMG_NoSampler_Src_Helper <op, asm, VReg_96, 0>;
|
||||
let VDataDwords = 4 in
|
||||
defm _V4 : MIMG_NoSampler_Src_Helper <op, asm, VReg_128, 0>;
|
||||
let VDataDwords = 8 in
|
||||
defm _V8 : MIMG_NoSampler_Src_Helper <op, asm, VReg_256, 0>;
|
||||
}
|
||||
}
|
||||
|
||||
@ -414,8 +411,6 @@ multiclass MIMG_Sampler <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
|
||||
defm _V3 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_96>;
|
||||
let VDataDwords = 4 in
|
||||
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128>;
|
||||
let VDataDwords = 8 in
|
||||
defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
|
||||
}
|
||||
}
|
||||
|
||||
@ -426,7 +421,6 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
|
||||
string asm = "image_gather4"#sample.LowerCaseMod> {
|
||||
def "" : MIMG_Sampler_BaseOpcode<sample> {
|
||||
let HasD16 = 1;
|
||||
let Gather4 = 1;
|
||||
}
|
||||
|
||||
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
|
||||
@ -435,8 +429,6 @@ multiclass MIMG_Gather <bits<7> op, AMDGPUSampleVariant sample, bit wqm = 0,
|
||||
defm _V2 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_64>; /* for packed D16 only */
|
||||
let VDataDwords = 4 in
|
||||
defm _V4 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_128, 1>;
|
||||
let VDataDwords = 8 in
|
||||
defm _V8 : MIMG_Sampler_Src_Helper<op, asm, sample, VReg_256>;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1,181 +0,0 @@
|
||||
//===-- SIAddIMGInit.cpp - Add any required IMG inits ---------------------===//
|
||||
//
|
||||
// The LLVM Compiler Infrastructure
|
||||
//
|
||||
// This file is distributed under the University of Illinois Open Source
|
||||
// License. See LICENSE.TXT for details.
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
/// \file
|
||||
/// Any MIMG instructions that use tfe or lwe require an initialization of the
|
||||
/// result register that will be written in the case of a memory access failure
|
||||
/// The required code is also added to tie this init code to the result of the
|
||||
/// img instruction
|
||||
///
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
|
||||
#include "AMDGPU.h"
|
||||
#include "AMDGPUSubtarget.h"
|
||||
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
|
||||
#include "SIInstrInfo.h"
|
||||
#include "llvm/CodeGen/MachineFunctionPass.h"
|
||||
#include "llvm/CodeGen/MachineInstrBuilder.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/Support/Debug.h"
|
||||
#include "llvm/Target/TargetMachine.h"
|
||||
|
||||
#define DEBUG_TYPE "si-img-init"
|
||||
|
||||
using namespace llvm;
|
||||
|
||||
namespace {
|
||||
|
||||
class SIAddIMGInit : public MachineFunctionPass {
|
||||
public:
|
||||
static char ID;
|
||||
|
||||
public:
|
||||
SIAddIMGInit() : MachineFunctionPass(ID) {
|
||||
initializeSIAddIMGInitPass(*PassRegistry::getPassRegistry());
|
||||
}
|
||||
|
||||
bool runOnMachineFunction(MachineFunction &MF) override;
|
||||
|
||||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.setPreservesCFG();
|
||||
MachineFunctionPass::getAnalysisUsage(AU);
|
||||
}
|
||||
};
|
||||
|
||||
} // End anonymous namespace.
|
||||
|
||||
INITIALIZE_PASS(SIAddIMGInit, DEBUG_TYPE, "SI Add IMG Init", false, false)
|
||||
|
||||
char SIAddIMGInit::ID = 0;
|
||||
|
||||
char &llvm::SIAddIMGInitID = SIAddIMGInit::ID;
|
||||
|
||||
FunctionPass *llvm::createSIAddIMGInitPass() { return new SIAddIMGInit(); }
|
||||
|
||||
bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
|
||||
MachineRegisterInfo &MRI = MF.getRegInfo();
|
||||
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
|
||||
const SIInstrInfo *TII = ST.getInstrInfo();
|
||||
const SIRegisterInfo *RI = ST.getRegisterInfo();
|
||||
bool Changed = false;
|
||||
|
||||
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
|
||||
++BI) {
|
||||
MachineBasicBlock &MBB = *BI;
|
||||
MachineBasicBlock::iterator I, Next;
|
||||
for (I = MBB.begin(); I != MBB.end(); I = Next) {
|
||||
Next = std::next(I);
|
||||
MachineInstr &MI = *I;
|
||||
|
||||
auto Opcode = MI.getOpcode();
|
||||
if (TII->isMIMG(Opcode) && !MI.mayStore()) {
|
||||
MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
|
||||
MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
|
||||
MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
|
||||
|
||||
// Check for instructions that don't have tfe or lwe fields
|
||||
// There shouldn't be any at this point.
|
||||
assert( (TFE && LWE) && "Expected tfe and lwe operands in instruction");
|
||||
|
||||
unsigned TFEVal = TFE->getImm();
|
||||
unsigned LWEVal = LWE->getImm();
|
||||
unsigned D16Val = D16 ? D16->getImm() : 0;
|
||||
|
||||
if (TFEVal || LWEVal) {
|
||||
// At least one of TFE or LWE are non-zero
|
||||
// We have to insert a suitable initialization of the result value and
|
||||
// tie this to the dest of the image instruction.
|
||||
|
||||
const DebugLoc &DL = MI.getDebugLoc();
|
||||
|
||||
int DstIdx =
|
||||
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
|
||||
|
||||
// Calculate which dword we have to initialize to 0.
|
||||
MachineOperand *MO_Dmask =
|
||||
TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
|
||||
|
||||
// check that dmask operand is found.
|
||||
assert(MO_Dmask && "Expected dmask operand in instruction");
|
||||
|
||||
unsigned dmask = MO_Dmask->getImm();
|
||||
// Determine the number of active lanes taking into account the
|
||||
// Gather4 special case
|
||||
unsigned ActiveLanes =
|
||||
TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
|
||||
|
||||
// Subreg indices are counted from 1
|
||||
// When D16 then we want next whole VGPR after write data.
|
||||
static_assert(AMDGPU::sub0 == 1 && AMDGPU::sub4 == 5, "Subreg indices different from expected");
|
||||
|
||||
bool Packed = !ST.hasUnpackedD16VMem();
|
||||
|
||||
unsigned InitIdx =
|
||||
D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
|
||||
|
||||
// Abandon attempt if the dst size isn't large enough
|
||||
// - this is in fact an error but this is picked up elsewhere and
|
||||
// reported correctly.
|
||||
uint32_t DstSize =
|
||||
RI->getRegSizeInBits(*TII->getOpRegClass(MI, DstIdx)) / 32;
|
||||
if (DstSize < InitIdx)
|
||||
continue;
|
||||
|
||||
// Create a register for the intialization value.
|
||||
unsigned PrevDst =
|
||||
MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
|
||||
unsigned NewDst = 0; // Final initialized value will be in here
|
||||
|
||||
// If PRTStrictNull feature is enabled (the default) then initialize
|
||||
// all the result registers to 0, otherwise just the error indication
|
||||
// register (VGPRn+1)
|
||||
unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1;
|
||||
unsigned CurrIdx = ST.usePRTStrictNull() ? 1 : InitIdx;
|
||||
|
||||
if (DstSize == 1) {
|
||||
// In this case we can just initialize the result directly
|
||||
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), PrevDst)
|
||||
.addImm(0);
|
||||
NewDst = PrevDst;
|
||||
} else {
|
||||
BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
|
||||
for (; SizeLeft; SizeLeft--, CurrIdx++) {
|
||||
NewDst =
|
||||
MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
|
||||
// Initialize dword
|
||||
unsigned SubReg =
|
||||
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
|
||||
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
|
||||
.addImm(0);
|
||||
// Insert into the super-reg
|
||||
BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
|
||||
.addReg(PrevDst)
|
||||
.addReg(SubReg)
|
||||
.addImm(CurrIdx);
|
||||
|
||||
PrevDst = NewDst;
|
||||
}
|
||||
}
|
||||
|
||||
// Add as an implicit operand
|
||||
MachineInstrBuilder(MF, MI).addReg(NewDst, RegState::Implicit);
|
||||
|
||||
// Tie the just added implicit operand to the dst
|
||||
MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
|
||||
|
||||
Changed = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Changed;
|
||||
}
|
@ -216,7 +216,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
||||
|
||||
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
|
||||
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
|
||||
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
|
||||
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
|
||||
|
||||
setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
|
||||
@ -814,47 +813,6 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
|
||||
Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
|
||||
}
|
||||
|
||||
static MVT memVTFromAggregate(Type *Ty) {
|
||||
// Only limited forms of aggregate type currently expected.
|
||||
assert(Ty->isStructTy() && "Expected struct type");
|
||||
|
||||
|
||||
Type *ElementType = nullptr;
|
||||
unsigned NumElts;
|
||||
if (Ty->getContainedType(0)->isVectorTy()) {
|
||||
VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
|
||||
ElementType = VecComponent->getElementType();
|
||||
NumElts = VecComponent->getNumElements();
|
||||
} else {
|
||||
ElementType = Ty->getContainedType(0);
|
||||
NumElts = 1;
|
||||
}
|
||||
|
||||
assert(Ty->getContainedType(1)->isIntegerTy(32) && "Expected int32 type");
|
||||
|
||||
// Calculate the size of the memVT type from the aggregate
|
||||
unsigned Pow2Elts = 0;
|
||||
unsigned ElementSize;
|
||||
switch (ElementType->getTypeID()) {
|
||||
default:
|
||||
llvm_unreachable("Unknown type!");
|
||||
case Type::IntegerTyID:
|
||||
ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
|
||||
break;
|
||||
case Type::HalfTyID:
|
||||
ElementSize = 16;
|
||||
break;
|
||||
case Type::FloatTyID:
|
||||
ElementSize = 32;
|
||||
break;
|
||||
}
|
||||
unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
|
||||
Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
|
||||
|
||||
return MVT::getVectorVT(MVT::getVT(ElementType, false),
|
||||
Pow2Elts);
|
||||
}
|
||||
|
||||
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
|
||||
const CallInst &CI,
|
||||
MachineFunction &MF,
|
||||
@ -882,12 +840,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
|
||||
Info.flags = MachineMemOperand::MODereferenceable;
|
||||
if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
|
||||
Info.opc = ISD::INTRINSIC_W_CHAIN;
|
||||
Info.memVT = MVT::getVT(CI.getType(), true);
|
||||
if (Info.memVT == MVT::Other) {
|
||||
// Some intrinsics return an aggregate type - special case to work out
|
||||
// the correct memVT
|
||||
Info.memVT = memVTFromAggregate(CI.getType());
|
||||
}
|
||||
Info.memVT = MVT::getVT(CI.getType());
|
||||
Info.flags |= MachineMemOperand::MOLoad;
|
||||
} else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
|
||||
Info.opc = ISD::INTRINSIC_VOID;
|
||||
@ -4660,109 +4613,6 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
|
||||
return Value == 0;
|
||||
}
|
||||
|
||||
// Re-construct the required return value for a image load intrinsic.
|
||||
// This is more complicated due to the optional use TexFailCtrl which means the required
|
||||
// return type is an aggregate
|
||||
static SDValue constructRetValue(SelectionDAG &DAG,
|
||||
MachineSDNode *Result,
|
||||
ArrayRef<EVT> ResultTypes,
|
||||
bool IsTexFail, bool Unpacked, bool IsD16,
|
||||
int DMaskPop, int NumVDataDwords,
|
||||
const SDLoc &DL, LLVMContext &Context) {
|
||||
// Determine the required return type. This is the same regardless of IsTexFail flag
|
||||
EVT ReqRetVT = ResultTypes[0];
|
||||
EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
|
||||
int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
|
||||
EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
|
||||
EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
|
||||
: AdjEltVT
|
||||
: ReqRetVT;
|
||||
|
||||
// Extract data part of the result
|
||||
// Bitcast the result to the same type as the required return type
|
||||
int NumElts;
|
||||
if (IsD16 && !Unpacked)
|
||||
NumElts = NumVDataDwords << 1;
|
||||
else
|
||||
NumElts = NumVDataDwords;
|
||||
|
||||
EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
|
||||
: AdjEltVT;
|
||||
|
||||
// Special case for v8f16. Rather than add support for this, use v4i32 to
|
||||
// extract the data elements
|
||||
bool V8F16Special = false;
|
||||
if (CastVT == MVT::v8f16) {
|
||||
CastVT = MVT::v4i32;
|
||||
DMaskPop >>= 1;
|
||||
ReqRetNumElts >>= 1;
|
||||
V8F16Special = true;
|
||||
AdjVT = MVT::v2i32;
|
||||
}
|
||||
|
||||
SDValue N = SDValue(Result, 0);
|
||||
SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
|
||||
|
||||
// Iterate over the result
|
||||
SmallVector<SDValue, 4> BVElts;
|
||||
|
||||
if (CastVT.isVector()) {
|
||||
DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
|
||||
} else {
|
||||
BVElts.push_back(CastRes);
|
||||
}
|
||||
int ExtraElts = ReqRetNumElts - DMaskPop;
|
||||
while(ExtraElts--)
|
||||
BVElts.push_back(DAG.getUNDEF(AdjEltVT));
|
||||
|
||||
SDValue PreTFCRes;
|
||||
if (ReqRetNumElts > 1) {
|
||||
SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
|
||||
if (IsD16 && Unpacked)
|
||||
PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
|
||||
else
|
||||
PreTFCRes = NewVec;
|
||||
} else {
|
||||
PreTFCRes = BVElts[0];
|
||||
}
|
||||
|
||||
if (V8F16Special)
|
||||
PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
|
||||
|
||||
if (!IsTexFail) {
|
||||
if (Result->getNumValues() > 1)
|
||||
return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
|
||||
else
|
||||
return PreTFCRes;
|
||||
}
|
||||
|
||||
// Extract the TexFail result and insert into aggregate return
|
||||
SmallVector<SDValue, 1> TFCElt;
|
||||
DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
|
||||
SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
|
||||
return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
|
||||
}
|
||||
|
||||
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
|
||||
SDValue *LWE, bool &IsTexFail) {
|
||||
auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode());
|
||||
if (!TexFailCtrlConst)
|
||||
return false;
|
||||
|
||||
uint64_t Value = TexFailCtrlConst->getZExtValue();
|
||||
if (Value) {
|
||||
IsTexFail = true;
|
||||
}
|
||||
|
||||
SDLoc DL(TexFailCtrlConst);
|
||||
*TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
|
||||
Value &= ~(uint64_t)0x1;
|
||||
*LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
|
||||
Value &= ~(uint64_t)0x2;
|
||||
|
||||
return Value == 0;
|
||||
}
|
||||
|
||||
SDValue SITargetLowering::lowerImage(SDValue Op,
|
||||
const AMDGPU::ImageDimIntrinsicInfo *Intr,
|
||||
SelectionDAG &DAG) const {
|
||||
@ -4776,17 +4626,13 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
|
||||
AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
|
||||
unsigned IntrOpcode = Intr->BaseOpcode;
|
||||
|
||||
SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
|
||||
SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
|
||||
SmallVector<EVT, 2> ResultTypes(Op->value_begin(), Op->value_end());
|
||||
bool IsD16 = false;
|
||||
bool IsA16 = false;
|
||||
SDValue VData;
|
||||
int NumVDataDwords;
|
||||
bool AdjustRetType = false;
|
||||
|
||||
unsigned AddrIdx; // Index of first address argument
|
||||
unsigned DMask;
|
||||
unsigned DMaskLanes = 0;
|
||||
|
||||
if (BaseOpcode->Atomic) {
|
||||
VData = Op.getOperand(2);
|
||||
@ -4809,12 +4655,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
|
||||
AddrIdx = 3;
|
||||
}
|
||||
} else {
|
||||
unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1;
|
||||
auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
|
||||
if (!DMaskConst)
|
||||
return Op;
|
||||
DMask = DMaskConst->getZExtValue();
|
||||
DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
|
||||
unsigned DMaskIdx;
|
||||
|
||||
if (BaseOpcode->Store) {
|
||||
VData = Op.getOperand(2);
|
||||
@ -4830,32 +4671,37 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
|
||||
}
|
||||
|
||||
NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
|
||||
DMaskIdx = 3;
|
||||
} else {
|
||||
// Work out the num dwords based on the dmask popcount and underlying type
|
||||
// and whether packing is supported.
|
||||
MVT LoadVT = ResultTypes[0].getSimpleVT();
|
||||
MVT LoadVT = Op.getSimpleValueType();
|
||||
if (LoadVT.getScalarType() == MVT::f16) {
|
||||
if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS ||
|
||||
!BaseOpcode->HasD16)
|
||||
return Op; // D16 is unsupported for this instruction
|
||||
|
||||
IsD16 = true;
|
||||
if (LoadVT.isVector() && Subtarget->hasUnpackedD16VMem())
|
||||
ResultTypes[0] = (LoadVT == MVT::v2f16) ? MVT::v2i32 : MVT::v4i32;
|
||||
}
|
||||
|
||||
// Confirm that the return type is large enough for the dmask specified
|
||||
if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
|
||||
(!LoadVT.isVector() && DMaskLanes > 1))
|
||||
return Op;
|
||||
|
||||
if (IsD16 && !Subtarget->hasUnpackedD16VMem())
|
||||
NumVDataDwords = (DMaskLanes + 1) / 2;
|
||||
else
|
||||
NumVDataDwords = DMaskLanes;
|
||||
|
||||
AdjustRetType = true;
|
||||
NumVDataDwords = (ResultTypes[0].getSizeInBits() + 31) / 32;
|
||||
DMaskIdx = isa<MemSDNode>(Op) ? 2 : 1;
|
||||
}
|
||||
|
||||
auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx));
|
||||
if (!DMaskConst)
|
||||
return Op;
|
||||
|
||||
AddrIdx = DMaskIdx + 1;
|
||||
DMask = DMaskConst->getZExtValue();
|
||||
if (!DMask && !BaseOpcode->Store) {
|
||||
// Eliminate no-op loads. Stores with dmask == 0 are *not* no-op: they
|
||||
// store the channels' default values.
|
||||
SDValue Undef = DAG.getUNDEF(Op.getValueType());
|
||||
if (isa<MemSDNode>(Op))
|
||||
return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
|
||||
return Undef;
|
||||
}
|
||||
}
|
||||
|
||||
unsigned NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
|
||||
@ -4934,53 +4780,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
|
||||
CtrlIdx = AddrIdx + NumVAddrs + 3;
|
||||
}
|
||||
|
||||
SDValue TFE;
|
||||
SDValue LWE;
|
||||
SDValue TexFail = Op.getOperand(CtrlIdx);
|
||||
bool IsTexFail = false;
|
||||
if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
|
||||
auto TexFailConst = dyn_cast<ConstantSDNode>(TexFail.getNode());
|
||||
if (!TexFailConst || TexFailConst->getZExtValue() != 0)
|
||||
return Op;
|
||||
|
||||
if (IsTexFail) {
|
||||
if (!NumVDataDwords) {
|
||||
// Expecting to get an error flag since TFC is on - and dmask is 0
|
||||
// Force dmask to be at least 1 otherwise the instruction will fail
|
||||
DMask = 0x1;
|
||||
DMaskLanes = 1;
|
||||
NumVDataDwords = 1;
|
||||
}
|
||||
NumVDataDwords += 1;
|
||||
AdjustRetType = true;
|
||||
}
|
||||
|
||||
// Has something earlier tagged that the return type needs adjusting
|
||||
// This happens if the instruction is a load or has set TexFailCtrl flags
|
||||
if (AdjustRetType) {
|
||||
// NumVDataDwords reflects the true number of dwords required in the return type
|
||||
if (NumVDataDwords == 0 && !BaseOpcode->Store) {
|
||||
// This is a no-op load. This can be eliminated
|
||||
SDValue Undef = DAG.getUNDEF(Op.getValueType());
|
||||
if (isa<MemSDNode>(Op))
|
||||
return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
|
||||
return Undef;
|
||||
}
|
||||
|
||||
// Have to use a power of 2 number of dwords
|
||||
NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords);
|
||||
|
||||
EVT NewVT = NumVDataDwords > 1 ?
|
||||
EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
|
||||
: MVT::f32;
|
||||
|
||||
ResultTypes[0] = NewVT;
|
||||
if (ResultTypes.size() == 3) {
|
||||
// Original result was aggregate type used for TexFailCtrl results
|
||||
// The actual instruction returns as a vector type which has now been
|
||||
// created. Remove the aggregate result.
|
||||
ResultTypes.erase(&ResultTypes[1]);
|
||||
}
|
||||
}
|
||||
|
||||
SDValue GLC;
|
||||
SDValue SLC;
|
||||
if (BaseOpcode->Atomic) {
|
||||
@ -5005,8 +4809,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
|
||||
Ops.push_back(SLC);
|
||||
Ops.push_back(IsA16 && // a16 or r128
|
||||
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
|
||||
Ops.push_back(TFE); // tfe
|
||||
Ops.push_back(LWE); // lwe
|
||||
Ops.push_back(False); // tfe
|
||||
Ops.push_back(False); // lwe
|
||||
Ops.push_back(DimInfo->DA ? True : False);
|
||||
if (BaseOpcode->HasD16)
|
||||
Ops.push_back(IsD16 ? True : False);
|
||||
@ -5034,12 +4838,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
|
||||
SmallVector<SDValue, 1> Elt;
|
||||
DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
|
||||
return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
|
||||
} else if (!BaseOpcode->Store) {
|
||||
return constructRetValue(DAG, NewNode,
|
||||
OrigResultTypes, IsTexFail,
|
||||
Subtarget->hasUnpackedD16VMem(), IsD16,
|
||||
DMaskLanes, NumVDataDwords, DL,
|
||||
*DAG.getContext());
|
||||
} else if (IsD16 && !BaseOpcode->Store) {
|
||||
MVT LoadVT = Op.getSimpleValueType();
|
||||
SDValue Adjusted = adjustLoadValueTypeImpl(
|
||||
SDValue(NewNode, 0), LoadVT, DL, DAG, Subtarget->hasUnpackedD16VMem());
|
||||
return DAG.getMergeValues({Adjusted, SDValue(NewNode, 1)}, DL);
|
||||
}
|
||||
|
||||
return SDValue(NewNode, 0);
|
||||
@ -8969,7 +8772,6 @@ static unsigned SubIdx2Lane(unsigned Idx) {
|
||||
case AMDGPU::sub1: return 1;
|
||||
case AMDGPU::sub2: return 2;
|
||||
case AMDGPU::sub3: return 3;
|
||||
case AMDGPU::sub4: return 4; // Possible with TFE/LWE
|
||||
}
|
||||
}
|
||||
|
||||
@ -8983,16 +8785,11 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
|
||||
if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
|
||||
return Node; // not implemented for D16
|
||||
|
||||
SDNode *Users[5] = { nullptr };
|
||||
SDNode *Users[4] = { nullptr };
|
||||
unsigned Lane = 0;
|
||||
unsigned DmaskIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
|
||||
unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
|
||||
unsigned NewDmask = 0;
|
||||
unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
|
||||
unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
|
||||
bool UsesTFC = (Node->getConstantOperandVal(TFEIdx) ||
|
||||
Node->getConstantOperandVal(LWEIdx)) ? 1 : 0;
|
||||
unsigned TFCLane = 0;
|
||||
bool HasChain = Node->getNumValues() > 1;
|
||||
|
||||
if (OldDmask == 0) {
|
||||
@ -9000,12 +8797,6 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
|
||||
return Node;
|
||||
}
|
||||
|
||||
unsigned OldBitsSet = countPopulation(OldDmask);
|
||||
// Work out which is the TFE/LWE lane if that is enabled.
|
||||
if (UsesTFC) {
|
||||
TFCLane = OldBitsSet;
|
||||
}
|
||||
|
||||
// Try to figure out the used register components
|
||||
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
|
||||
I != E; ++I) {
|
||||
@ -9025,49 +8816,28 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
|
||||
// set, etc.
|
||||
Lane = SubIdx2Lane(I->getConstantOperandVal(1));
|
||||
|
||||
// Check if the use is for the TFE/LWE generated result at VGPRn+1.
|
||||
if (UsesTFC && Lane == TFCLane) {
|
||||
Users[Lane] = *I;
|
||||
} else {
|
||||
// Set which texture component corresponds to the lane.
|
||||
unsigned Comp;
|
||||
for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
|
||||
Comp = countTrailingZeros(Dmask);
|
||||
Dmask &= ~(1 << Comp);
|
||||
}
|
||||
|
||||
// Abort if we have more than one user per component.
|
||||
if (Users[Lane])
|
||||
return Node;
|
||||
|
||||
Users[Lane] = *I;
|
||||
NewDmask |= 1 << Comp;
|
||||
// Set which texture component corresponds to the lane.
|
||||
unsigned Comp;
|
||||
for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
|
||||
Comp = countTrailingZeros(Dmask);
|
||||
Dmask &= ~(1 << Comp);
|
||||
}
|
||||
|
||||
// Abort if we have more than one user per component
|
||||
if (Users[Lane])
|
||||
return Node;
|
||||
|
||||
Users[Lane] = *I;
|
||||
NewDmask |= 1 << Comp;
|
||||
}
|
||||
|
||||
// Don't allow 0 dmask, as hardware assumes one channel enabled.
|
||||
bool NoChannels = !NewDmask;
|
||||
if (NoChannels) {
|
||||
// If the original dmask has one channel - then nothing to do
|
||||
if (OldBitsSet == 1)
|
||||
return Node;
|
||||
// Use an arbitrary dmask - required for the instruction to work
|
||||
NewDmask = 1;
|
||||
}
|
||||
// Abort if there's no change
|
||||
if (NewDmask == OldDmask)
|
||||
return Node;
|
||||
|
||||
unsigned BitsSet = countPopulation(NewDmask);
|
||||
|
||||
// Check for TFE or LWE - increase the number of channels by one to account
|
||||
// for the extra return value
|
||||
// This will need adjustment for D16 if this is also included in
|
||||
// adjustWriteMask (this function) but at present D16 are excluded.
|
||||
unsigned NewChannels = BitsSet + UsesTFC;
|
||||
|
||||
int NewOpcode =
|
||||
AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
|
||||
int NewOpcode = AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), BitsSet);
|
||||
assert(NewOpcode != -1 &&
|
||||
NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
|
||||
"failed to find equivalent MIMG op");
|
||||
@ -9080,9 +8850,8 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
|
||||
|
||||
MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
|
||||
|
||||
MVT ResultVT = NewChannels == 1 ?
|
||||
SVT : MVT::getVectorVT(SVT, NewChannels == 3 ? 4 :
|
||||
NewChannels == 5 ? 8 : NewChannels);
|
||||
MVT ResultVT = BitsSet == 1 ?
|
||||
SVT : MVT::getVectorVT(SVT, BitsSet == 3 ? 4 : BitsSet);
|
||||
SDVTList NewVTList = HasChain ?
|
||||
DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
|
||||
|
||||
@ -9096,7 +8865,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
|
||||
DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
|
||||
}
|
||||
|
||||
if (NewChannels == 1) {
|
||||
if (BitsSet == 1) {
|
||||
assert(Node->hasNUsesOfValue(1, 0));
|
||||
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY,
|
||||
SDLoc(Node), Users[Lane]->getValueType(0),
|
||||
@ -9106,24 +8875,19 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
|
||||
}
|
||||
|
||||
// Update the users of the node with the new indices
|
||||
for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
|
||||
for (unsigned i = 0, Idx = AMDGPU::sub0; i < 4; ++i) {
|
||||
SDNode *User = Users[i];
|
||||
if (!User) {
|
||||
// Handle the special case of NoChannels. We set NewDmask to 1 above, but
|
||||
// Users[0] is still nullptr because channel 0 doesn't really have a use.
|
||||
if (i || !NoChannels)
|
||||
continue;
|
||||
} else {
|
||||
SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
|
||||
DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
|
||||
}
|
||||
if (!User)
|
||||
continue;
|
||||
|
||||
SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
|
||||
DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
|
||||
|
||||
switch (Idx) {
|
||||
default: break;
|
||||
case AMDGPU::sub0: Idx = AMDGPU::sub1; break;
|
||||
case AMDGPU::sub1: Idx = AMDGPU::sub2; break;
|
||||
case AMDGPU::sub2: Idx = AMDGPU::sub3; break;
|
||||
case AMDGPU::sub3: Idx = AMDGPU::sub4; break;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2968,42 +2968,6 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
|
||||
}
|
||||
}
|
||||
|
||||
// Verify MIMG
|
||||
if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
|
||||
// Ensure that the return type used is large enough for all the options
|
||||
// being used TFE/LWE require an extra result register.
|
||||
const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
|
||||
if (DMask) {
|
||||
uint64_t DMaskImm = DMask->getImm();
|
||||
uint32_t RegCount =
|
||||
isGather4(MI.getOpcode()) ? 4 : countPopulation(DMaskImm);
|
||||
const MachineOperand *TFE = getNamedOperand(MI, AMDGPU::OpName::tfe);
|
||||
const MachineOperand *LWE = getNamedOperand(MI, AMDGPU::OpName::lwe);
|
||||
const MachineOperand *D16 = getNamedOperand(MI, AMDGPU::OpName::d16);
|
||||
|
||||
// Adjust for packed 16 bit values
|
||||
if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
|
||||
RegCount >>= 1;
|
||||
|
||||
// Adjust if using LWE or TFE
|
||||
if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
|
||||
RegCount += 1;
|
||||
|
||||
const uint32_t DstIdx =
|
||||
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
|
||||
const MachineOperand &Dst = MI.getOperand(DstIdx);
|
||||
if (Dst.isReg()) {
|
||||
const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
|
||||
uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
|
||||
if (RegCount > DstSize) {
|
||||
ErrInfo = "MIMG instruction returns too many registers for dst "
|
||||
"register class";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Verify VOP*. Ignore multiple sgpr operands on writelane.
|
||||
if (Desc.getOpcode() != AMDGPU::V_WRITELANE_B32
|
||||
&& (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI) || isSDWA(MI))) {
|
||||
|
@ -184,7 +184,6 @@ struct MIMGBaseOpcodeInfo {
|
||||
bool Atomic;
|
||||
bool AtomicX2;
|
||||
bool Sampler;
|
||||
bool Gather4;
|
||||
|
||||
uint8_t NumExtraArgs;
|
||||
bool Gradients;
|
||||
|
@ -802,8 +802,7 @@ private:
|
||||
|
||||
Value *simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
|
||||
APInt DemandedElts,
|
||||
int DmaskIdx = -1,
|
||||
int TFCIdx = -1);
|
||||
int DmaskIdx = -1);
|
||||
|
||||
Value *SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
|
||||
APInt &UndefElts, unsigned Depth = 0);
|
||||
|
@ -969,24 +969,11 @@ InstCombiner::simplifyShrShlDemandedBits(Instruction *Shr, const APInt &ShrOp1,
|
||||
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
|
||||
Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
|
||||
APInt DemandedElts,
|
||||
int DMaskIdx,
|
||||
int TFCIdx) {
|
||||
int DMaskIdx) {
|
||||
unsigned VWidth = II->getType()->getVectorNumElements();
|
||||
if (VWidth == 1)
|
||||
return nullptr;
|
||||
|
||||
// Need to change to new instruction format
|
||||
ConstantInt *TFC = nullptr;
|
||||
bool TFELWEEnabled = false;
|
||||
if (TFCIdx > 0) {
|
||||
TFC = dyn_cast<ConstantInt>(II->getArgOperand(TFCIdx));
|
||||
TFELWEEnabled = TFC->getZExtValue() & 0x1 // TFE
|
||||
|| TFC->getZExtValue() & 0x2; // LWE
|
||||
}
|
||||
|
||||
if (TFELWEEnabled)
|
||||
return nullptr; // TFE not yet supported
|
||||
|
||||
ConstantInt *NewDMask = nullptr;
|
||||
|
||||
if (DMaskIdx < 0) {
|
||||
@ -1635,8 +1622,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
|
||||
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts);
|
||||
default: {
|
||||
if (getAMDGPUImageDMaskIntrinsic(II->getIntrinsicID()))
|
||||
return simplifyAMDGCNMemoryIntrinsicDemanded(
|
||||
II, DemandedElts, 0, II->getNumArgOperands() - 2);
|
||||
return simplifyAMDGCNMemoryIntrinsicDemanded(II, DemandedElts, 0);
|
||||
|
||||
break;
|
||||
}
|
||||
|
@ -1,7 +1,6 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI,SIVI,PRT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI,SIVI,PRT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,PRT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-enable-prt-strict-null -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900,NOPRT %s
|
||||
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s
|
||||
|
||||
; GCN-LABEL: {{^}}load_1d:
|
||||
; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm{{$}}
|
||||
@ -11,52 +10,6 @@ main_body:
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_1d_tfe:
|
||||
; PRT: v_mov_b32_e32 v0, 0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; NOPRT: v_mov_b32_e32 v4, 0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v1
|
||||
; NOPRT-NOT: v_mov_b32_e32 v2
|
||||
; NOPRT-NOT: v_mov_b32_e32 v3
|
||||
; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm tfe{{$}}
|
||||
; SIVI: buffer_store_dword v4, off, s[8:11], 0
|
||||
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
|
||||
define amdgpu_ps <4 x float> @load_1d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_1d_lwe:
|
||||
; PRT: v_mov_b32_e32 v0, 0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; NOPRT: v_mov_b32_e32 v4, 0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v1
|
||||
; NOPRT-NOT: v_mov_b32_e32 v2
|
||||
; NOPRT-NOT: v_mov_b32_e32 v3
|
||||
; GCN: image_load v[0:7], v{{[0-9]+}}, s[0:7] dmask:0xf unorm lwe{{$}}
|
||||
; SIVI: buffer_store_dword v4, off, s[8:11], 0
|
||||
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
|
||||
define amdgpu_ps <4 x float> @load_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
|
||||
main_body:
|
||||
%v = call {<4 x float>, i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 15, i32 %s, <8 x i32> %rsrc, i32 2, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_2d:
|
||||
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
|
||||
define amdgpu_ps <4 x float> @load_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t) {
|
||||
@ -65,29 +18,6 @@ main_body:
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_2d_tfe:
|
||||
; PRT: v_mov_b32_e32 v0, 0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; NOPRT: v_mov_b32_e32 v4, 0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v1
|
||||
; NOPRT-NOT: v_mov_b32_e32 v2
|
||||
; NOPRT-NOT: v_mov_b32_e32 v3
|
||||
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
|
||||
; SIVI: buffer_store_dword v4, off, s[8:11], 0
|
||||
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
|
||||
define amdgpu_ps <4 x float> @load_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_3d:
|
||||
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
|
||||
define amdgpu_ps <4 x float> @load_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r) {
|
||||
@ -96,29 +26,6 @@ main_body:
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_3d_tfe_lwe:
|
||||
; PRT: v_mov_b32_e32 v0, 0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; NOPRT: v_mov_b32_e32 v4, 0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v1
|
||||
; NOPRT-NOT: v_mov_b32_e32 v2
|
||||
; NOPRT-NOT: v_mov_b32_e32 v3
|
||||
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
|
||||
; SIVI: buffer_store_dword v4, off, s[8:11], 0
|
||||
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
|
||||
define amdgpu_ps <4 x float> @load_3d_tfe_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %r) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 3, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_cube:
|
||||
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
|
||||
define amdgpu_ps <4 x float> @load_cube(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
|
||||
@ -127,29 +34,6 @@ main_body:
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_cube_lwe:
|
||||
; PRT: v_mov_b32_e32 v0, 0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; NOPRT: v_mov_b32_e32 v4, 0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v1
|
||||
; NOPRT-NOT: v_mov_b32_e32 v2
|
||||
; NOPRT-NOT: v_mov_b32_e32 v3
|
||||
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
|
||||
; SIVI: buffer_store_dword v4, off, s[8:11], 0
|
||||
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
|
||||
define amdgpu_ps <4 x float> @load_cube_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_1darray:
|
||||
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm da{{$}}
|
||||
define amdgpu_ps <4 x float> @load_1darray(<8 x i32> inreg %rsrc, i32 %s, i32 %slice) {
|
||||
@ -158,29 +42,6 @@ main_body:
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_1darray_tfe:
|
||||
; PRT: v_mov_b32_e32 v0, 0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; NOPRT: v_mov_b32_e32 v4, 0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v1
|
||||
; NOPRT-NOT: v_mov_b32_e32 v2
|
||||
; NOPRT-NOT: v_mov_b32_e32 v3
|
||||
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
|
||||
; SIVI: buffer_store_dword v4, off, s[8:11], 0
|
||||
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
|
||||
define amdgpu_ps <4 x float> @load_1darray_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %slice) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32 15, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_2darray:
|
||||
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
|
||||
define amdgpu_ps <4 x float> @load_2darray(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice) {
|
||||
@ -189,29 +50,6 @@ main_body:
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_2darray_lwe:
|
||||
; PRT: v_mov_b32_e32 v0, 0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; NOPRT: v_mov_b32_e32 v4, 0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v1
|
||||
; NOPRT-NOT: v_mov_b32_e32 v2
|
||||
; NOPRT-NOT: v_mov_b32_e32 v3
|
||||
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe da{{$}}
|
||||
; SIVI: buffer_store_dword v4, off, s[8:11], 0
|
||||
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
|
||||
define amdgpu_ps <4 x float> @load_2darray_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 2, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_2dmsaa:
|
||||
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
|
||||
define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %fragid) {
|
||||
@ -220,29 +58,6 @@ main_body:
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_2dmsaa_both:
|
||||
; PRT: v_mov_b32_e32 v0, 0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; NOPRT: v_mov_b32_e32 v4, 0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v1
|
||||
; NOPRT-NOT: v_mov_b32_e32 v2
|
||||
; NOPRT-NOT: v_mov_b32_e32 v3
|
||||
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe lwe{{$}}
|
||||
; SIVI: buffer_store_dword v4, off, s[8:11], 0
|
||||
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
|
||||
define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %fragid) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_2darraymsaa:
|
||||
; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm da{{$}}
|
||||
define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
|
||||
@ -251,29 +66,6 @@ main_body:
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_2darraymsaa_tfe:
|
||||
; PRT: v_mov_b32_e32 v0, 0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; NOPRT: v_mov_b32_e32 v4, 0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v1
|
||||
; NOPRT-NOT: v_mov_b32_e32 v2
|
||||
; NOPRT-NOT: v_mov_b32_e32 v3
|
||||
; GCN: image_load v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe da{{$}}
|
||||
; SIVI: buffer_store_dword v4, off, s[8:11], 0
|
||||
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
|
||||
define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_mip_1d:
|
||||
; GCN: image_load_mip v[0:3], v[0:1], s[0:7] dmask:0xf unorm{{$}}
|
||||
define amdgpu_ps <4 x float> @load_mip_1d(<8 x i32> inreg %rsrc, i32 %s, i32 %mip) {
|
||||
@ -282,29 +74,6 @@ main_body:
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_mip_1d_lwe:
|
||||
; PRT: v_mov_b32_e32 v0, 0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; NOPRT: v_mov_b32_e32 v4, 0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v1
|
||||
; NOPRT-NOT: v_mov_b32_e32 v2
|
||||
; NOPRT-NOT: v_mov_b32_e32 v3
|
||||
; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm lwe{{$}}
|
||||
; SIVI: buffer_store_dword v4, off, s[8:11], 0
|
||||
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
|
||||
define amdgpu_ps <4 x float> @load_mip_1d_lwe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %mip) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32 15, i32 %s, i32 %mip, <8 x i32> %rsrc, i32 2, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_mip_2d:
|
||||
; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
|
||||
define amdgpu_ps <4 x float> @load_mip_2d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
|
||||
@ -313,191 +82,6 @@ main_body:
|
||||
ret <4 x float> %v
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_mip_2d_tfe:
|
||||
; PRT: v_mov_b32_e32 v0, 0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
|
||||
; NOPRT: v_mov_b32_e32 v4, 0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v0
|
||||
; NOPRT-NOT: v_mov_b32_e32 v1
|
||||
; NOPRT-NOT: v_mov_b32_e32 v2
|
||||
; NOPRT-NOT: v_mov_b32_e32 v3
|
||||
; GCN: image_load_mip v[0:7], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0xf unorm tfe{{$}}
|
||||
; SIVI: buffer_store_dword v4, off, s[8:11], 0
|
||||
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v4
|
||||
define amdgpu_ps <4 x float> @load_mip_2d_tfe(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s, i32 %t, i32 %mip) {
|
||||
main_body:
|
||||
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.vec = extractvalue {<4 x float>, i32} %v, 0
|
||||
%v.err = extractvalue {<4 x float>, i32} %v, 1
|
||||
store i32 %v.err, i32 addrspace(1)* %out, align 4
|
||||
ret <4 x float> %v.vec
|
||||
}
|
||||
|
||||
; Make sure that error flag is returned even with dmask 0
|
||||
; GCN-LABEL: {{^}}load_1d_V2_tfe_dmask0:
|
||||
; GCN: v_mov_b32_e32 v1, 0
|
||||
; PRT-DAG: v_mov_b32_e32 v2, v1
|
||||
; PRT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
|
||||
; NOPRT-NOT: v_mov_b32_e32 v1
|
||||
; NOPRT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
|
||||
define amdgpu_ps float @load_1d_V2_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
|
||||
main_body:
|
||||
%v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
|
||||
%v.err = extractvalue {<2 x float>, i32} %v, 1
|
||||
%vv = bitcast i32 %v.err to float
|
||||
ret float %vv
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}load_1d_V1_tfe_dmask0:
; GCN: v_mov_b32_e32 v1, 0
; PRT-DAG: v_mov_b32_e32 v2, v1
; PRT: image_load v[1:2], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm tfe{{$}}
define amdgpu_ps float @load_1d_V1_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s) {
main_body:
%v = call {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32 0, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.err = extractvalue {float, i32} %v, 1
%vv = bitcast i32 %v.err to float
ret float %vv
}

; GCN-LABEL: {{^}}load_mip_2d_tfe_dmask0:
; GCN: v_mov_b32_e32 v3, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
define amdgpu_ps float @load_mip_2d_tfe_dmask0(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 0, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
%v.err = extractvalue {<4 x float>, i32} %v, 1
%vv = bitcast i32 %v.err to float
ret float %vv
}

; Do not make dmask 0 even if no result (other than tfe) is used.
; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse:
; GCN: v_mov_b32_e32 v3, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
define amdgpu_ps float @load_mip_2d_tfe_nouse(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32 15, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
%v.err = extractvalue {<4 x float>, i32} %v, 1
%vv = bitcast i32 %v.err to float
ret float %vv
}

; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse_V2:
; GCN: v_mov_b32_e32 v3, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x1 unorm tfe{{$}}
define amdgpu_ps float @load_mip_2d_tfe_nouse_V2(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
main_body:
%v = call {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32 6, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
%v.err = extractvalue {<2 x float>, i32} %v, 1
%vv = bitcast i32 %v.err to float
ret float %vv
}

; GCN-LABEL: {{^}}load_mip_2d_tfe_nouse_V1:
; GCN: v_mov_b32_e32 v3, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v3
; PRT: image_load_mip v[3:4], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x2 unorm tfe{{$}}
; NOPRT-NOT: v_mov_b32_e32 v2
; NOPRT: image_load_mip v[2:3], v[{{[0-9]+:[0-9]+}}], s[0:7] dmask:0x2 unorm tfe{{$}}
define amdgpu_ps float @load_mip_2d_tfe_nouse_V1(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %mip) {
main_body:
%v = call {float, i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32 2, i32 %s, i32 %t, i32 %mip, <8 x i32> %rsrc, i32 1, i32 0)
%v.err = extractvalue {float, i32} %v, 1
%vv = bitcast i32 %v.err to float
ret float %vv
}

; Check for dmask being materially smaller than return type
; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask3:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v3, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; NOPRT-NOT: v_mov_b32_e32 v2
; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x7 unorm tfe{{$}}
; SIVI: buffer_store_dword v3, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v3
define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask3(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 7, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}

; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask2:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v2, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; NOPRT-NOT: v_mov_b32_e32 v1
; GCN: image_load v[0:3], v{{[0-9]+}}, s[0:7] dmask:0x6 unorm tfe{{$}}
; SIVI: buffer_store_dword v2, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v2
define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask2(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 6, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}

; GCN-LABEL: {{^}}load_1d_tfe_V4_dmask1:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v1, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; GCN: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}}
; SIVI: buffer_store_dword v1, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v1
define amdgpu_ps <4 x float> @load_1d_tfe_V4_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}

; GCN-LABEL: {{^}}load_1d_tfe_V2_dmask1:
; PRT: v_mov_b32_e32 v0, 0
; PRT-DAG: v_mov_b32_e32 v{{[0-9]+}}, v0
; NOPRT: v_mov_b32_e32 v1, 0
; NOPRT-NOT: v_mov_b32_e32 v0
; GCN: image_load v[0:1], v{{[0-9]+}}, s[0:7] dmask:0x8 unorm tfe{{$}}
; SIVI: buffer_store_dword v1, off, s[8:11], 0
; GFX900: global_store_dword v[{{[0-9]+:[0-9]+}}], v1
define amdgpu_ps <2 x float> @load_1d_tfe_V2_dmask1(<8 x i32> inreg %rsrc, i32 addrspace(1)* inreg %out, i32 %s) {
main_body:
%v = call {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32 8, i32 %s, <8 x i32> %rsrc, i32 1, i32 0)
%v.vec = extractvalue {<2 x float>, i32} %v, 0
%v.err = extractvalue {<2 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <2 x float> %v.vec
}


; GCN-LABEL: {{^}}load_mip_3d:
; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @load_mip_3d(<8 x i32> inreg %rsrc, i32 %s, i32 %t, i32 %r, i32 %mip) {
@ -820,37 +404,23 @@ define amdgpu_ps float @image_load_mmo(<8 x i32> inreg %rsrc, float addrspace(3)
store float 0.000000e+00, float addrspace(3)* %lds
%c0 = extractelement <2 x i32> %c, i32 0
%c1 = extractelement <2 x i32> %c, i32 1
%tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 1, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
%tex = call float @llvm.amdgcn.image.load.2d.f32.i32(i32 15, i32 %c0, i32 %c1, <8 x i32> %rsrc, i32 0, i32 0)
%tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
store float 0.000000e+00, float addrspace(3)* %tmp2
ret float %tex
}

declare <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32, i32, <8 x i32>, i32, i32) #1
declare {float,i32} @llvm.amdgcn.image.load.1d.f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
declare {<2 x float>,i32} @llvm.amdgcn.image.load.1d.v2f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.1d.v4f32i32.i32(i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.2d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.3d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.3d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.cube.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.cube.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.1darray.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.1darray.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.2darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darray.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.2dmsaa.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.2dmsaa.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.2darraymsaa.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.2darraymsaa.v4f32i32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1

declare <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.1d.v4f32i32.i32(i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v4f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {<2 x float>,i32} @llvm.amdgcn.image.load.mip.2d.v2f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare {float,i32} @llvm.amdgcn.image.load.mip.2d.f32i32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.3d.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.cube.v4f32.i32(i32, i32, i32, i32, i32, <8 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.load.mip.1darray.v4f32.i32(i32, i32, i32, i32, <8 x i32>, i32, i32) #1

@ -1,7 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s

; GCN-LABEL: {{^}}load.f16.1d:
; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 d16
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
define amdgpu_ps <4 x half> @load.f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -10,7 +10,7 @@ main_body:
}

; GCN-LABEL: {{^}}load.v2f16.1d:
; GCN: image_load v0, v0, s[0:7] dmask:0x3 unorm a16 d16
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
define amdgpu_ps <4 x half> @load.v2f16.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -37,7 +37,7 @@ main_body:
}

; GCN-LABEL: {{^}}load.f16.2d:
; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16 d16
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x1 unorm a16 d16
define amdgpu_ps <4 x half> @load.f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -47,7 +47,7 @@ main_body:
}

; GCN-LABEL: {{^}}load.v2f16.2d:
; GCN: image_load v0, v0, s[0:7] dmask:0x3 unorm a16 d16
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16 d16
define amdgpu_ps <4 x half> @load.v2f16.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -77,7 +77,7 @@ main_body:
}

; GCN-LABEL: {{^}}load.f16.3d:
; GCN: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm a16 d16
; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x1 unorm a16 d16
define amdgpu_ps <4 x half> @load.f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
@ -88,7 +88,7 @@ main_body:
}

; GCN-LABEL: {{^}}load.v2f16.3d:
; GCN: image_load v0, v[0:1], s[0:7] dmask:0x3 unorm a16 d16
; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16 d16
define amdgpu_ps <4 x half> @load.v2f16.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0

@ -1,7 +1,7 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s

; GCN-LABEL: {{^}}load.f32.1d:
; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
define amdgpu_ps <4 x float> @load.f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -10,7 +10,7 @@ main_body:
}

; GCN-LABEL: {{^}}load.v2f32.1d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
define amdgpu_ps <4 x float> @load.v2f32.1d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -37,7 +37,7 @@ main_body:
}

; GCN-LABEL: {{^}}load.f32.2d:
; GCN: image_load v0, v0, s[0:7] dmask:0x1 unorm a16
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x1 unorm a16
define amdgpu_ps <4 x float> @load.f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -47,7 +47,7 @@ main_body:
}

; GCN-LABEL: {{^}}load.v2f32.2d:
; GCN: image_load v[0:1], v0, s[0:7] dmask:0x3 unorm a16
; GCN: image_load v[0:3], v0, s[0:7] dmask:0x3 unorm a16
define amdgpu_ps <4 x float> @load.v2f32.2d(<8 x i32> inreg %rsrc, <2 x i16> %coords) {
main_body:
%x = extractelement <2 x i16> %coords, i32 0
@ -77,7 +77,7 @@ main_body:
}

; GCN-LABEL: {{^}}load.f32.3d:
; GCN: image_load v0, v[0:1], s[0:7] dmask:0x1 unorm a16
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x1 unorm a16
define amdgpu_ps <4 x float> @load.f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0
@ -88,7 +88,7 @@ main_body:
}

; GCN-LABEL: {{^}}load.v2f32.3d:
; GCN: image_load v[0:1], v[0:1], s[0:7] dmask:0x3 unorm a16
; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0x3 unorm a16
define amdgpu_ps <4 x float> @load.v2f32.3d(<8 x i32> inreg %rsrc, <2 x i16> %coords_lo, <2 x i16> %coords_hi) {
main_body:
%x = extractelement <2 x i16> %coords_lo, i32 0

@ -10,19 +10,6 @@ main_body:
ret half %tex
}

; GCN-LABEL: {{^}}image_sample_2d_f16_tfe:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
; PACKED: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}}
; UNPACKED: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 tfe d16{{$}}
define amdgpu_ps half @image_sample_2d_f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t, i32 addrspace(1)* inreg %out) {
main_body:
%tex = call {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32 1, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {half, i32} %tex, 0
%tex.err = extractvalue {half, i32} %tex, 1
store i32 %tex.err, i32 addrspace(1)* %out, align 4
ret half %tex.vec
}

; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16:
; UNPACKED: image_sample_c_d v[0:1], v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}}
; PACKED: image_sample_c_d v0, v[0:3], s[0:7], s[8:11] dmask:0x3 d16{{$}}
@ -33,22 +20,6 @@ main_body:
ret float %r
}

; GCN-LABEL: {{^}}image_sample_c_d_1d_v2f16_tfe:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
; UNPACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}}
; PACKED: image_sample_c_d v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0x3 tfe d16{{$}}
define amdgpu_ps <2 x float> @image_sample_c_d_1d_v2f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %zcompare, float %dsdh, float %dsdv, float %s) {
main_body:
%tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {<2 x half>, i32} %tex, 0
%tex.err = extractvalue {<2 x half>, i32} %tex, 1
%tex.vecf = bitcast <2 x half> %tex.vec to float
%r.0 = insertelement <2 x float> undef, float %tex.vecf, i32 0
%tex.errf = bitcast i32 %tex.err to float
%r = insertelement <2 x float> %r.0, float %tex.errf, i32 1
ret <2 x float> %r
}

; GCN-LABEL: {{^}}image_sample_b_2d_v4f16:
; UNPACKED: image_sample_b v[0:3], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}}
; PACKED: image_sample_b v[0:1], v[0:3], s[0:7], s[8:11] dmask:0xf d16{{$}}
@ -59,33 +30,9 @@ main_body:
ret <2 x float> %r
}

; GCN-LABEL: {{^}}image_sample_b_2d_v4f16_tfe:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0
; UNPACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}}
; PACKED: image_sample_b v[{{[0-9]+:[0-9]+}}], v[0:3], s[0:7], s[8:11] dmask:0xf tfe d16{{$}}
define amdgpu_ps <4 x float> @image_sample_b_2d_v4f16_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %bias, float %s, float %t) {
main_body:
%tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0)
%tex.vec = extractvalue {<4 x half>, i32} %tex, 0
%tex.err = extractvalue {<4 x half>, i32} %tex, 1
%tex.vecf = bitcast <4 x half> %tex.vec to <2 x float>
%tex.vecf.0 = extractelement <2 x float> %tex.vecf, i32 0
%tex.vecf.1 = extractelement <2 x float> %tex.vecf, i32 1
%r.0 = insertelement <4 x float> undef, float %tex.vecf.0, i32 0
%r.1 = insertelement <4 x float> %r.0, float %tex.vecf.1, i32 1
%tex.errf = bitcast i32 %tex.err to float
%r = insertelement <4 x float> %r.1, float %tex.errf, i32 2
ret <4 x float> %r
}

declare half @llvm.amdgcn.image.sample.2d.f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {half,i32} @llvm.amdgcn.image.sample.2d.f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.sample.2d.v4f16.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<2 x half>,i32} @llvm.amdgcn.image.sample.2d.v2f16i32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <2 x half> @llvm.amdgcn.image.sample.c.d.1d.v2f16.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x half> @llvm.amdgcn.image.sample.b.2d.v4f16.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }

@ -9,162 +9,6 @@ main_body:
ret <4 x float> %v
}

; GCN-LABEL: {{^}}sample_1d_tfe:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v2, v0
; GCN: v_mov_b32_e32 v3, v0
; GCN: v_mov_b32_e32 v4, v0
; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf tfe{{$}}
define amdgpu_ps <4 x float> @sample_1d_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}

; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_1:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x1 tfe{{$}}
define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f = extractelement <4 x float> %res.vec, i32 0
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
%res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1
ret <2 x float> %res
}

; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_2:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x2 tfe{{$}}
define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f = extractelement <4 x float> %res.vec, i32 1
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
%res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1
ret <2 x float> %res
}

; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_3:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x4 tfe{{$}}
define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f = extractelement <4 x float> %res.vec, i32 2
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
%res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1
ret <2 x float> %res
}

; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_4:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: image_sample v[0:1], v2, s[0:7], s[8:11] dmask:0x8 tfe{{$}}
define amdgpu_ps <2 x float> @sample_1d_tfe_adjust_writemask_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f = extractelement <4 x float> %res.vec, i32 3
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp = insertelement <2 x float> undef, float %res.f, i32 0
%res = insertelement <2 x float> %res.tmp, float %res.errf, i32 1
ret <2 x float> %res
}

; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_12:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v2, v0
; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 tfe{{$}}
define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_12(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f1 = extractelement <4 x float> %res.vec, i32 0
%res.f2 = extractelement <4 x float> %res.vec, i32 1
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
%res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
%res = insertelement <4 x float> %res.tmp2, float %res.errf, i32 2
ret <4 x float> %res
}

; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_24:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v2, v0
; GCN: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa tfe{{$}}
define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_24(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f1 = extractelement <4 x float> %res.vec, i32 1
%res.f2 = extractelement <4 x float> %res.vec, i32 3
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
%res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
%res = insertelement <4 x float> %res.tmp2, float %res.errf, i32 2
ret <4 x float> %res
}

; GCN-LABEL: {{^}}sample_1d_tfe_adjust_writemask_134:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v2, v0
; GCN: v_mov_b32_e32 v3, v0
; GCN: image_sample v[0:3], v4, s[0:7], s[8:11] dmask:0xd tfe{{$}}
define amdgpu_ps <4 x float> @sample_1d_tfe_adjust_writemask_134(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%res.vec = extractvalue {<4 x float>,i32} %v, 0
%res.f1 = extractelement <4 x float> %res.vec, i32 0
%res.f2 = extractelement <4 x float> %res.vec, i32 2
%res.f3 = extractelement <4 x float> %res.vec, i32 3
%res.err = extractvalue {<4 x float>,i32} %v, 1
%res.errf = bitcast i32 %res.err to float
%res.tmp1 = insertelement <4 x float> undef, float %res.f1, i32 0
%res.tmp2 = insertelement <4 x float> %res.tmp1, float %res.f2, i32 1
%res.tmp3 = insertelement <4 x float> %res.tmp2, float %res.f3, i32 2
%res = insertelement <4 x float> %res.tmp3, float %res.errf, i32 3
ret <4 x float> %res
}

; GCN-LABEL: {{^}}sample_1d_lwe:
; GCN: v_mov_b32_e32 v0, 0
; GCN: v_mov_b32_e32 v1, v0
; GCN: v_mov_b32_e32 v2, v0
; GCN: v_mov_b32_e32 v3, v0
; GCN: v_mov_b32_e32 v4, v0
; GCN: image_sample v[0:7], v5, s[0:7], s[8:11] dmask:0xf lwe{{$}}
define amdgpu_ps <4 x float> @sample_1d_lwe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 addrspace(1)* inreg %out, float %s) {
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 2, i32 0)
%v.vec = extractvalue {<4 x float>, i32} %v, 0
%v.err = extractvalue {<4 x float>, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret <4 x float> %v.vec
}

; GCN-LABEL: {{^}}sample_2d:
; GCN: image_sample v[0:3], v[0:1], s[0:7], s[8:11] dmask:0xf{{$}}
define amdgpu_ps <4 x float> @sample_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
@ -517,17 +361,6 @@ main_body:
ret float %v
}

; GCN-LABEL: {{^}}sample_c_d_o_2darray_V1_tfe:
; GCN: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 tfe da{{$}}
define amdgpu_ps float @sample_c_d_o_2darray_V1_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, i32 addrspace(1)* inreg %out) {
main_body:
%v = call {float,i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32 4, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%v.vec = extractvalue {float, i32} %v, 0
%v.err = extractvalue {float, i32} %v, 1
store i32 %v.err, i32 addrspace(1)* %out, align 4
ret float %v.vec
}

; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2:
; GCN: image_sample_c_d_o v[0:1], v[0:15], s[0:7], s[8:11] dmask:0x6 da{{$}}
define amdgpu_ps <2 x float> @sample_c_d_o_2darray_V2(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
@ -536,22 +369,6 @@ main_body:
ret <2 x float> %v
}

; GCN-LABEL: {{^}}sample_c_d_o_2darray_V2_tfe:
; GCN: image_sample_c_d_o v[9:12], v[0:15], s[0:7], s[8:11] dmask:0x6 tfe da{{$}}
define amdgpu_ps <4 x float> @sample_c_d_o_2darray_V2_tfe(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice) {
main_body:
%v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0)
%v.vec = extractvalue {<2 x float>, i32} %v, 0
%v.f1 = extractelement <2 x float> %v.vec, i32 0
%v.f2 = extractelement <2 x float> %v.vec, i32 1
%v.err = extractvalue {<2 x float>, i32} %v, 1
%v.errf = bitcast i32 %v.err to float
%res.0 = insertelement <4 x float> undef, float %v.f1, i32 0
%res.1 = insertelement <4 x float> %res.0, float %v.f2, i32 1
%res.2 = insertelement <4 x float> %res.1, float %v.errf, i32 2
ret <4 x float> %res.2
}

; GCN-LABEL: {{^}}sample_1d_unorm:
; GCN: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf unorm{{$}}
define amdgpu_ps <4 x float> @sample_1d_unorm(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
@ -674,7 +491,6 @@ main_body:
}

declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.cube.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
@ -726,9 +542,7 @@ declare <4 x float> @llvm.amdgcn.image.sample.c.lz.1d.v4f32.f32(i32, float, floa
declare <4 x float> @llvm.amdgcn.image.sample.c.lz.2d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

declare float @llvm.amdgcn.image.sample.c.d.o.2darray.f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {float, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.f32i32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32, i32, float, float, float, float, float, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1

attributes #0 = { nounwind }
attributes #1 = { nounwind readonly }

@ -328,28 +328,6 @@ define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32(float %vaddr, <8
ret float %elt0
}

; Check that the intrinsic remains unchanged in the presence of TFE or LWE
; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_tfe(
; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0)
; CHECK: ret float %elt0
define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_tfe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
%data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 1, i32 0)
%data.vec = extractvalue {<4 x float>,i32} %data, 0
%elt0 = extractelement <4 x float> %data.vec, i32 0
ret float %elt0
}

; Check that the intrinsic remains unchanged in the presence of TFE or LWE
; CHECK-LABEL: @extract_elt0_image_sample_1d_v4f32_f32_lwe(
; CHECK-NEXT: %data = call { <4 x float>, i32 } @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0)
; CHECK: ret float %elt0
define amdgpu_ps float @extract_elt0_image_sample_1d_v4f32_f32_lwe(float %vaddr, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
%data = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32 15, float %vaddr, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 2, i32 0)
%data.vec = extractvalue {<4 x float>,i32} %data, 0
%elt0 = extractelement <4 x float> %data.vec, i32 0
ret float %elt0
}

; CHECK-LABEL: @extract_elt0_image_sample_2d_v4f32_f32(
; CHECK-NEXT: %data = call float @llvm.amdgcn.image.sample.2d.f32.f32(i32 1, float %s, float %t, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
; CHECK-NEXT: ret float %data
@ -528,7 +506,6 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_dmask_1111_image_sample_1d_
}

declare <4 x float> @llvm.amdgcn.image.sample.1d.v4f32.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.sl_v4f32i32s.f32(i32, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.2d.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.3d.v4f32.f32(i32, float, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1
declare <4 x float> @llvm.amdgcn.image.sample.1darray.v4f32.f32(i32, float, float, <8 x i32>, <4 x i32>, i1, i32, i32) #1