mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 11:13:28 +01:00
[NVPTX] Added support for .f16x2 instructions.
This patch enables support for .f16x2 operations. Added new register type Float16x2. Added support for .f16x2 instructions. Added handling of vectorized loads/stores of v2f16 values. Differential Revision: https://reviews.llvm.org/D30057 Differential Revision: https://reviews.llvm.org/D30310 llvm-svn: 296032
This commit is contained in:
parent
26bd3a9606
commit
828acceeb5
@ -64,6 +64,9 @@ void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
|
||||
case 7:
|
||||
OS << "%h";
|
||||
break;
|
||||
case 8:
|
||||
OS << "%hh";
|
||||
break;
|
||||
}
|
||||
|
||||
unsigned VReg = RegNo & 0x0FFFFFFF;
|
||||
|
@ -363,6 +363,8 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
|
||||
Ret = (6 << 28);
|
||||
} else if (RC == &NVPTX::Float16RegsRegClass) {
|
||||
Ret = (7 << 28);
|
||||
} else if (RC == &NVPTX::Float16x2RegsRegClass) {
|
||||
Ret = (8 << 28);
|
||||
} else {
|
||||
report_fatal_error("Bad register class");
|
||||
}
|
||||
|
@ -84,6 +84,14 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
|
||||
if (tryStore(N))
|
||||
return;
|
||||
break;
|
||||
case ISD::EXTRACT_VECTOR_ELT:
|
||||
if (tryEXTRACT_VECTOR_ELEMENT(N))
|
||||
return;
|
||||
break;
|
||||
case NVPTXISD::SETP_F16X2:
|
||||
SelectSETP_F16X2(N);
|
||||
return;
|
||||
|
||||
case NVPTXISD::LoadV2:
|
||||
case NVPTXISD::LoadV4:
|
||||
if (tryLoadVector(N))
|
||||
@ -516,6 +524,127 @@ bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Map ISD:CONDCODE value to appropriate CmpMode expected by
|
||||
// NVPTXInstPrinter::printCmpMode()
|
||||
static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
|
||||
using NVPTX::PTXCmpMode::CmpMode;
|
||||
unsigned PTXCmpMode = [](ISD::CondCode CC) {
|
||||
switch (CC) {
|
||||
default:
|
||||
llvm_unreachable("Unexpected condition code.");
|
||||
case ISD::SETOEQ:
|
||||
return CmpMode::EQ;
|
||||
case ISD::SETOGT:
|
||||
return CmpMode::GT;
|
||||
case ISD::SETOGE:
|
||||
return CmpMode::GE;
|
||||
case ISD::SETOLT:
|
||||
return CmpMode::LT;
|
||||
case ISD::SETOLE:
|
||||
return CmpMode::LE;
|
||||
case ISD::SETONE:
|
||||
return CmpMode::NE;
|
||||
case ISD::SETO:
|
||||
return CmpMode::NUM;
|
||||
case ISD::SETUO:
|
||||
return CmpMode::NotANumber;
|
||||
case ISD::SETUEQ:
|
||||
return CmpMode::EQU;
|
||||
case ISD::SETUGT:
|
||||
return CmpMode::GTU;
|
||||
case ISD::SETUGE:
|
||||
return CmpMode::GEU;
|
||||
case ISD::SETULT:
|
||||
return CmpMode::LTU;
|
||||
case ISD::SETULE:
|
||||
return CmpMode::LEU;
|
||||
case ISD::SETUNE:
|
||||
return CmpMode::NEU;
|
||||
case ISD::SETEQ:
|
||||
return CmpMode::EQ;
|
||||
case ISD::SETGT:
|
||||
return CmpMode::GT;
|
||||
case ISD::SETGE:
|
||||
return CmpMode::GE;
|
||||
case ISD::SETLT:
|
||||
return CmpMode::LT;
|
||||
case ISD::SETLE:
|
||||
return CmpMode::LE;
|
||||
case ISD::SETNE:
|
||||
return CmpMode::NE;
|
||||
}
|
||||
}(CondCode.get());
|
||||
|
||||
if (FTZ)
|
||||
PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
|
||||
|
||||
return PTXCmpMode;
|
||||
}
|
||||
|
||||
// Replace node N with a SETP_f16x2rr machine node that produces two i1
// results. Operands 0 and 1 of N are forwarded unchanged; operand 2 must be
// a CondCodeSDNode and is encoded as an i32 target constant through
// getPTXCmpMode(), using useF32FTZ() to decide whether the FTZ flag is set.
// Always returns true.
bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
  const CondCodeSDNode &CC = *cast<CondCodeSDNode>(N->getOperand(2));
  const unsigned CmpModeImm = getPTXCmpMode(CC, useF32FTZ());

  SDLoc Loc(N);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  SDValue ModeOperand = CurDAG->getTargetConstant(CmpModeImm, Loc, MVT::i32);

  SDNode *NewSetP = CurDAG->getMachineNode(NVPTX::SETP_f16x2rr, Loc, MVT::i1,
                                           MVT::i1, LHS, RHS, ModeOperand);
  ReplaceNode(N, NewSetP);
  return true;
}
|
||||
|
||||
// Find all instances of extract_vector_elt that use this v2f16 vector
|
||||
// and coalesce them into a scattering move instruction.
|
||||
bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
|
||||
SDValue Vector = N->getOperand(0);
|
||||
|
||||
// We only care about f16x2 as it's the only real vector type we
|
||||
// need to deal with.
|
||||
if (Vector.getSimpleValueType() != MVT::v2f16)
|
||||
return false;
|
||||
|
||||
// Find and record all uses of this vector that extract element 0 or 1.
|
||||
SmallVector<SDNode *, 4> E0, E1;
|
||||
for (const auto &U : Vector.getNode()->uses()) {
|
||||
if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
|
||||
continue;
|
||||
if (U->getOperand(0) != Vector)
|
||||
continue;
|
||||
if (const ConstantSDNode *IdxConst =
|
||||
dyn_cast<ConstantSDNode>(U->getOperand(1))) {
|
||||
if (IdxConst->getZExtValue() == 0)
|
||||
E0.push_back(U);
|
||||
else if (IdxConst->getZExtValue() == 1)
|
||||
E1.push_back(U);
|
||||
else
|
||||
llvm_unreachable("Invalid vector index.");
|
||||
}
|
||||
}
|
||||
|
||||
// There's no point scattering f16x2 if we only ever access one
|
||||
// element of it.
|
||||
if (E0.empty() || E1.empty())
|
||||
return false;
|
||||
|
||||
unsigned Op = NVPTX::SplitF16x2;
|
||||
// If the vector has been BITCAST'ed from i32, we can use original
|
||||
// value directly and avoid register-to-register move.
|
||||
SDValue Source = Vector;
|
||||
if (Vector->getOpcode() == ISD::BITCAST) {
|
||||
Op = NVPTX::SplitI32toF16x2;
|
||||
Source = Vector->getOperand(0);
|
||||
}
|
||||
// Merge (f16 extractelt(V, 0), f16 extractelt(V,1))
|
||||
// into f16,f16 SplitF16x2(V)
|
||||
SDNode *ScatterOp =
|
||||
CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
|
||||
for (auto *Node : E0)
|
||||
ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
|
||||
for (auto *Node : E1)
|
||||
ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static unsigned int getCodeAddrSpace(MemSDNode *N) {
|
||||
const Value *Src = N->getMemOperand()->getValue();
|
||||
|
||||
@ -689,29 +818,26 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
|
||||
codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
|
||||
isVolatile = false;
|
||||
|
||||
// Vector Setting
|
||||
MVT SimpleVT = LoadedVT.getSimpleVT();
|
||||
unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
|
||||
if (SimpleVT.isVector()) {
|
||||
unsigned num = SimpleVT.getVectorNumElements();
|
||||
if (num == 2)
|
||||
vecType = NVPTX::PTXLdStInstCode::V2;
|
||||
else if (num == 4)
|
||||
vecType = NVPTX::PTXLdStInstCode::V4;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
// Type Setting: fromType + fromTypeWidth
|
||||
//
|
||||
// Sign : ISD::SEXTLOAD
|
||||
// Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
|
||||
// type is integer
|
||||
// Float : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
|
||||
MVT SimpleVT = LoadedVT.getSimpleVT();
|
||||
MVT ScalarVT = SimpleVT.getScalarType();
|
||||
// Read at least 8 bits (predicates are stored as 8-bit values)
|
||||
unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
|
||||
unsigned int fromType;
|
||||
|
||||
// Vector Setting
|
||||
unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
|
||||
if (SimpleVT.isVector()) {
|
||||
assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
|
||||
// v2f16 is loaded using ld.b32
|
||||
fromTypeWidth = 32;
|
||||
}
|
||||
|
||||
if ((LD->getExtensionType() == ISD::SEXTLOAD))
|
||||
fromType = NVPTX::PTXLdStInstCode::Signed;
|
||||
else if (ScalarVT.isFloatingPoint())
|
||||
@ -746,6 +872,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LD_f16_avar;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::LD_f16x2_avar;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LD_f32_avar;
|
||||
break;
|
||||
@ -777,6 +906,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LD_f16_asi;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::LD_f16x2_asi;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LD_f32_asi;
|
||||
break;
|
||||
@ -809,6 +941,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LD_f16_ari_64;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::LD_f16x2_ari_64;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LD_f32_ari_64;
|
||||
break;
|
||||
@ -835,6 +970,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LD_f16_ari;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::LD_f16x2_ari;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LD_f32_ari;
|
||||
break;
|
||||
@ -867,6 +1005,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LD_f16_areg_64;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::LD_f16x2_areg_64;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LD_f32_areg_64;
|
||||
break;
|
||||
@ -893,6 +1034,9 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LD_f16_areg;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::LD_f16x2_areg;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LD_f32_areg;
|
||||
break;
|
||||
@ -968,7 +1112,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
if (ExtensionType == ISD::SEXTLOAD)
|
||||
FromType = NVPTX::PTXLdStInstCode::Signed;
|
||||
else if (ScalarVT.isFloatingPoint())
|
||||
FromType = NVPTX::PTXLdStInstCode::Float;
|
||||
FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
|
||||
: NVPTX::PTXLdStInstCode::Float;
|
||||
else
|
||||
FromType = NVPTX::PTXLdStInstCode::Unsigned;
|
||||
|
||||
@ -987,6 +1132,16 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
|
||||
EVT EltVT = N->getValueType(0);
|
||||
|
||||
// v8f16 is a special case. PTX doesn't have ld.v8.f16
|
||||
// instruction. Instead, we split the vector into v2f16 chunks and
|
||||
// load them with ld.v4.b32.
|
||||
if (EltVT == MVT::v2f16) {
|
||||
assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
|
||||
EltVT = MVT::i32;
|
||||
FromType = NVPTX::PTXLdStInstCode::Untyped;
|
||||
FromTypeWidth = 32;
|
||||
}
|
||||
|
||||
if (SelectDirectAddr(Op1, Addr)) {
|
||||
switch (N->getOpcode()) {
|
||||
default:
|
||||
@ -1007,6 +1162,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::LDV_i64_v2_avar;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LDV_f16_v2_avar;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LDV_f32_v2_avar;
|
||||
break;
|
||||
@ -1028,6 +1186,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
case MVT::i32:
|
||||
Opcode = NVPTX::LDV_i32_v4_avar;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LDV_f16_v4_avar;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LDV_f32_v4_avar;
|
||||
break;
|
||||
@ -1060,6 +1221,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::LDV_i64_v2_asi;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LDV_f16_v2_asi;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LDV_f32_v2_asi;
|
||||
break;
|
||||
@ -1081,6 +1245,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
case MVT::i32:
|
||||
Opcode = NVPTX::LDV_i32_v4_asi;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LDV_f16_v4_asi;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LDV_f32_v4_asi;
|
||||
break;
|
||||
@ -1114,6 +1281,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::LDV_i64_v2_ari_64;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LDV_f16_v2_ari_64;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LDV_f32_v2_ari_64;
|
||||
break;
|
||||
@ -1135,6 +1305,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
case MVT::i32:
|
||||
Opcode = NVPTX::LDV_i32_v4_ari_64;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LDV_f16_v4_ari_64;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LDV_f32_v4_ari_64;
|
||||
break;
|
||||
@ -1161,6 +1334,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::LDV_i64_v2_ari;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LDV_f16_v2_ari;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LDV_f32_v2_ari;
|
||||
break;
|
||||
@ -1182,6 +1358,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
case MVT::i32:
|
||||
Opcode = NVPTX::LDV_i32_v4_ari;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LDV_f16_v4_ari;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LDV_f32_v4_ari;
|
||||
break;
|
||||
@ -1216,6 +1395,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::LDV_i64_v2_areg_64;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LDV_f16_v2_areg_64;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LDV_f32_v2_areg_64;
|
||||
break;
|
||||
@ -1237,6 +1419,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
case MVT::i32:
|
||||
Opcode = NVPTX::LDV_i32_v4_areg_64;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LDV_f16_v4_areg_64;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LDV_f32_v4_areg_64;
|
||||
break;
|
||||
@ -1263,6 +1448,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::LDV_i64_v2_areg;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LDV_f16_v2_areg;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LDV_f32_v2_areg;
|
||||
break;
|
||||
@ -1284,6 +1472,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
|
||||
case MVT::i32:
|
||||
Opcode = NVPTX::LDV_i32_v4_areg;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::LDV_f16_v4_areg;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::LDV_f32_v4_areg;
|
||||
break;
|
||||
@ -2151,21 +2342,18 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
|
||||
// Vector Setting
|
||||
MVT SimpleVT = StoreVT.getSimpleVT();
|
||||
unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
|
||||
if (SimpleVT.isVector()) {
|
||||
unsigned num = SimpleVT.getVectorNumElements();
|
||||
if (num == 2)
|
||||
vecType = NVPTX::PTXLdStInstCode::V2;
|
||||
else if (num == 4)
|
||||
vecType = NVPTX::PTXLdStInstCode::V4;
|
||||
else
|
||||
return false;
|
||||
}
|
||||
|
||||
// Type Setting: toType + toTypeWidth
|
||||
// - for integer type, always use 'u'
|
||||
//
|
||||
MVT ScalarVT = SimpleVT.getScalarType();
|
||||
unsigned toTypeWidth = ScalarVT.getSizeInBits();
|
||||
if (SimpleVT.isVector()) {
|
||||
assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
|
||||
// v2f16 is stored using st.b32
|
||||
toTypeWidth = 32;
|
||||
}
|
||||
|
||||
unsigned int toType;
|
||||
if (ScalarVT.isFloatingPoint())
|
||||
// f16 uses .b16 as its storage type.
|
||||
@ -2200,6 +2388,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::ST_f16_avar;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::ST_f16x2_avar;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::ST_f32_avar;
|
||||
break;
|
||||
@ -2232,6 +2423,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::ST_f16_asi;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::ST_f16x2_asi;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::ST_f32_asi;
|
||||
break;
|
||||
@ -2265,6 +2459,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::ST_f16_ari_64;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::ST_f16x2_ari_64;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::ST_f32_ari_64;
|
||||
break;
|
||||
@ -2291,6 +2488,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::ST_f16_ari;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::ST_f16x2_ari;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::ST_f32_ari;
|
||||
break;
|
||||
@ -2324,6 +2524,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::ST_f16_areg_64;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::ST_f16x2_areg_64;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::ST_f32_areg_64;
|
||||
break;
|
||||
@ -2350,6 +2553,9 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::ST_f16_areg;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::ST_f16x2_areg;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::ST_f32_areg;
|
||||
break;
|
||||
@ -2411,7 +2617,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
unsigned ToTypeWidth = ScalarVT.getSizeInBits();
|
||||
unsigned ToType;
|
||||
if (ScalarVT.isFloatingPoint())
|
||||
ToType = NVPTX::PTXLdStInstCode::Float;
|
||||
ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
|
||||
: NVPTX::PTXLdStInstCode::Float;
|
||||
else
|
||||
ToType = NVPTX::PTXLdStInstCode::Unsigned;
|
||||
|
||||
@ -2438,6 +2645,16 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// v8f16 is a special case. PTX doesn't have st.v8.f16
|
||||
// instruction. Instead, we split the vector into v2f16 chunks and
|
||||
// store them with st.v4.b32.
|
||||
if (EltVT == MVT::v2f16) {
|
||||
assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
|
||||
EltVT = MVT::i32;
|
||||
ToType = NVPTX::PTXLdStInstCode::Untyped;
|
||||
ToTypeWidth = 32;
|
||||
}
|
||||
|
||||
StOps.push_back(getI32Imm(IsVolatile, DL));
|
||||
StOps.push_back(getI32Imm(CodeAddrSpace, DL));
|
||||
StOps.push_back(getI32Imm(VecType, DL));
|
||||
@ -2464,6 +2681,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::STV_i64_v2_avar;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::STV_f16_v2_avar;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::STV_f32_v2_avar;
|
||||
break;
|
||||
@ -2513,6 +2733,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::STV_i64_v2_asi;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::STV_f16_v2_asi;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::STV_f32_v2_asi;
|
||||
break;
|
||||
@ -2534,6 +2757,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
case MVT::i32:
|
||||
Opcode = NVPTX::STV_i32_v4_asi;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::STV_f16_v4_asi;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::STV_f32_v4_asi;
|
||||
break;
|
||||
@ -2564,6 +2790,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::STV_i64_v2_ari_64;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::STV_f16_v2_ari_64;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::STV_f32_v2_ari_64;
|
||||
break;
|
||||
@ -2585,6 +2814,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
case MVT::i32:
|
||||
Opcode = NVPTX::STV_i32_v4_ari_64;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::STV_f16_v4_ari_64;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::STV_f32_v4_ari_64;
|
||||
break;
|
||||
@ -2611,6 +2843,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::STV_i64_v2_ari;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::STV_f16_v2_ari;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::STV_f32_v2_ari;
|
||||
break;
|
||||
@ -2632,6 +2867,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
case MVT::i32:
|
||||
Opcode = NVPTX::STV_i32_v4_ari;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::STV_f16_v4_ari;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::STV_f32_v4_ari;
|
||||
break;
|
||||
@ -2662,6 +2900,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::STV_i64_v2_areg_64;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::STV_f16_v2_areg_64;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::STV_f32_v2_areg_64;
|
||||
break;
|
||||
@ -2683,6 +2924,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
case MVT::i32:
|
||||
Opcode = NVPTX::STV_i32_v4_areg_64;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::STV_f16_v4_areg_64;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::STV_f32_v4_areg_64;
|
||||
break;
|
||||
@ -2709,6 +2953,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::STV_i64_v2_areg;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::STV_f16_v2_areg;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::STV_f32_v2_areg;
|
||||
break;
|
||||
@ -2730,6 +2977,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
|
||||
case MVT::i32:
|
||||
Opcode = NVPTX::STV_i32_v4_areg;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::STV_f16_v4_areg;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::STV_f32_v4_areg;
|
||||
break;
|
||||
@ -2804,6 +3054,9 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
|
||||
case MVT::f16:
|
||||
Opc = NVPTX::LoadParamMemF16;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opc = NVPTX::LoadParamMemF16x2;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opc = NVPTX::LoadParamMemF32;
|
||||
break;
|
||||
@ -2831,6 +3084,12 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
|
||||
case MVT::i64:
|
||||
Opc = NVPTX::LoadParamMemV2I64;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opc = NVPTX::LoadParamMemV2F16;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opc = NVPTX::LoadParamMemV2F16x2;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opc = NVPTX::LoadParamMemV2F32;
|
||||
break;
|
||||
@ -2855,6 +3114,12 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
|
||||
case MVT::i32:
|
||||
Opc = NVPTX::LoadParamMemV4I32;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opc = NVPTX::LoadParamMemV4F16;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opc = NVPTX::LoadParamMemV4F16x2;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opc = NVPTX::LoadParamMemV4F32;
|
||||
break;
|
||||
@ -2942,6 +3207,9 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::StoreRetvalF16;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::StoreRetvalF16x2;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::StoreRetvalF32;
|
||||
break;
|
||||
@ -2969,6 +3237,12 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::StoreRetvalV2I64;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::StoreRetvalV2F16;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::StoreRetvalV2F16x2;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::StoreRetvalV2F32;
|
||||
break;
|
||||
@ -2993,6 +3267,12 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
|
||||
case MVT::i32:
|
||||
Opcode = NVPTX::StoreRetvalV4I32;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::StoreRetvalV4F16;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::StoreRetvalV4F16x2;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::StoreRetvalV4F32;
|
||||
break;
|
||||
@ -3000,8 +3280,7 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
|
||||
break;
|
||||
}
|
||||
|
||||
SDNode *Ret =
|
||||
CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
|
||||
SDNode *Ret = CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
|
||||
MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
|
||||
MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
|
||||
cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
|
||||
@ -3078,6 +3357,9 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::StoreParamF16;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::StoreParamF16x2;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::StoreParamF32;
|
||||
break;
|
||||
@ -3105,6 +3387,12 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
|
||||
case MVT::i64:
|
||||
Opcode = NVPTX::StoreParamV2I64;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::StoreParamV2F16;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::StoreParamV2F16x2;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::StoreParamV2F32;
|
||||
break;
|
||||
@ -3129,6 +3417,12 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
|
||||
case MVT::i32:
|
||||
Opcode = NVPTX::StoreParamV4I32;
|
||||
break;
|
||||
case MVT::f16:
|
||||
Opcode = NVPTX::StoreParamV4F16;
|
||||
break;
|
||||
case MVT::v2f16:
|
||||
Opcode = NVPTX::StoreParamV4F16x2;
|
||||
break;
|
||||
case MVT::f32:
|
||||
Opcode = NVPTX::StoreParamV4F32;
|
||||
break;
|
||||
|
@ -71,6 +71,8 @@ private:
|
||||
bool trySurfaceIntrinsic(SDNode *N);
|
||||
bool tryBFE(SDNode *N);
|
||||
bool tryConstantFP16(SDNode *N);
|
||||
bool SelectSETP_F16X2(SDNode *N);
|
||||
bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
|
||||
|
||||
inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
|
||||
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
|
||||
|
@ -146,6 +146,9 @@ static bool IsPTXVectorType(MVT VT) {
|
||||
case MVT::v2i32:
|
||||
case MVT::v4i32:
|
||||
case MVT::v2i64:
|
||||
case MVT::v2f16:
|
||||
case MVT::v4f16:
|
||||
case MVT::v8f16: // <4 x f16x2>
|
||||
case MVT::v2f32:
|
||||
case MVT::v4f32:
|
||||
case MVT::v2f64:
|
||||
@ -170,13 +173,24 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
|
||||
for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
|
||||
EVT VT = TempVTs[i];
|
||||
uint64_t Off = TempOffsets[i];
|
||||
if (VT.isVector())
|
||||
for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
|
||||
ValueVTs.push_back(VT.getVectorElementType());
|
||||
if (Offsets)
|
||||
Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize());
|
||||
// Split vectors into individual elements, except for v2f16, which
|
||||
// we will pass as a single scalar.
|
||||
if (VT.isVector()) {
|
||||
unsigned NumElts = VT.getVectorNumElements();
|
||||
EVT EltVT = VT.getVectorElementType();
|
||||
// Vectors with an even number of f16 elements will be passed to
|
||||
// us as an array of v2f16 elements. We must match this so we
|
||||
// stay in sync with Ins/Outs.
|
||||
if (EltVT == MVT::f16 && NumElts % 2 == 0) {
|
||||
EltVT = MVT::v2f16;
|
||||
NumElts /= 2;
|
||||
}
|
||||
else {
|
||||
for (unsigned j = 0; j != NumElts; ++j) {
|
||||
ValueVTs.push_back(EltVT);
|
||||
if (Offsets)
|
||||
Offsets->push_back(Off + j * EltVT.getStoreSize());
|
||||
}
|
||||
} else {
|
||||
ValueVTs.push_back(VT);
|
||||
if (Offsets)
|
||||
Offsets->push_back(Off);
|
||||
@ -331,6 +345,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
|
||||
else
|
||||
setSchedulingPreference(Sched::Source);
|
||||
|
||||
auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
|
||||
LegalizeAction NoF16Action) {
|
||||
setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
|
||||
};
|
||||
|
||||
addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
|
||||
addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
|
||||
addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
|
||||
@ -338,13 +357,20 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
|
||||
addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
|
||||
addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
|
||||
addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
|
||||
addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
|
||||
|
||||
setOperationAction(ISD::SETCC, MVT::f16,
|
||||
STI.allowFP16Math() ? Legal : Promote);
|
||||
// Conversion to/from FP16/FP16x2 is always legal.
|
||||
setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
|
||||
setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
|
||||
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
|
||||
|
||||
setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
|
||||
setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
|
||||
|
||||
// Operations not directly supported by NVPTX.
|
||||
setOperationAction(ISD::SELECT_CC, MVT::f16,
|
||||
STI.allowFP16Math() ? Expand : Promote);
|
||||
setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
|
||||
setOperationAction(ISD::SELECT_CC, MVT::v2f16, Expand);
|
||||
setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
|
||||
setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
|
||||
setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
|
||||
@ -352,8 +378,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
|
||||
setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
|
||||
setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
|
||||
setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
|
||||
setOperationAction(ISD::BR_CC, MVT::f16,
|
||||
STI.allowFP16Math() ? Expand : Promote);
|
||||
setOperationAction(ISD::BR_CC, MVT::f16, Expand);
|
||||
setOperationAction(ISD::BR_CC, MVT::v2f16, Expand);
|
||||
setOperationAction(ISD::BR_CC, MVT::f32, Expand);
|
||||
setOperationAction(ISD::BR_CC, MVT::f64, Expand);
|
||||
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
|
||||
@ -493,58 +519,53 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
|
||||
setTargetDAGCombine(ISD::SREM);
|
||||
setTargetDAGCombine(ISD::UREM);
|
||||
|
||||
if (!STI.allowFP16Math()) {
|
||||
// Promote fp16 arithmetic if fp16 hardware isn't available or the
|
||||
// user passed --nvptx-no-fp16-math. The flag is useful because,
|
||||
// although sm_53+ GPUs have some sort of FP16 support in
|
||||
// hardware, only sm_53 and sm_60 have full implementation. Others
|
||||
// only have token amount of hardware and are likely to run faster
|
||||
// by using fp32 units instead.
|
||||
setOperationAction(ISD::FADD, MVT::f16, Promote);
|
||||
setOperationAction(ISD::FMUL, MVT::f16, Promote);
|
||||
setOperationAction(ISD::FSUB, MVT::f16, Promote);
|
||||
setOperationAction(ISD::FMA, MVT::f16, Promote);
|
||||
}
|
||||
// There's no neg.f16 instruction.
|
||||
setOperationAction(ISD::FNEG, MVT::f16, Expand);
|
||||
// setcc for f16x2 needs special handling to prevent legalizer's
|
||||
// attempt to scalarize it due to v2i1 not being legal.
|
||||
if (STI.allowFP16Math())
|
||||
setTargetDAGCombine(ISD::SETCC);
|
||||
|
||||
// Library functions. These default to Expand, but we have instructions
|
||||
// for them.
|
||||
setOperationAction(ISD::FCEIL, MVT::f16, Legal);
|
||||
setOperationAction(ISD::FCEIL, MVT::f32, Legal);
|
||||
setOperationAction(ISD::FCEIL, MVT::f64, Legal);
|
||||
setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
|
||||
setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
|
||||
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
|
||||
setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
|
||||
setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
|
||||
setOperationAction(ISD::FRINT, MVT::f16, Legal);
|
||||
setOperationAction(ISD::FRINT, MVT::f32, Legal);
|
||||
setOperationAction(ISD::FRINT, MVT::f64, Legal);
|
||||
setOperationAction(ISD::FROUND, MVT::f16, Legal);
|
||||
setOperationAction(ISD::FROUND, MVT::f32, Legal);
|
||||
setOperationAction(ISD::FROUND, MVT::f64, Legal);
|
||||
setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
|
||||
setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
|
||||
setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
|
||||
setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
|
||||
setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
|
||||
setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
|
||||
setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
|
||||
// Promote fp16 arithmetic if fp16 hardware isn't available or the
|
||||
// user passed --nvptx-no-fp16-math. The flag is useful because,
|
||||
// although sm_53+ GPUs have some sort of FP16 support in
|
||||
// hardware, only sm_53 and sm_60 have full implementation. Others
|
||||
// only have token amount of hardware and are likely to run faster
|
||||
// by using fp32 units instead.
|
||||
for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
|
||||
setFP16OperationAction(Op, MVT::f16, Legal, Promote);
|
||||
setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
|
||||
}
|
||||
|
||||
// There's no neg.f16 instruction. Expand to (0-x).
|
||||
setOperationAction(ISD::FNEG, MVT::f16, Expand);
|
||||
setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
|
||||
|
||||
// (would be) Library functions.
|
||||
|
||||
// These map to conversion instructions for scalar FP types.
|
||||
for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
|
||||
ISD::FROUND, ISD::FTRUNC}) {
|
||||
setOperationAction(Op, MVT::f16, Legal);
|
||||
setOperationAction(Op, MVT::f32, Legal);
|
||||
setOperationAction(Op, MVT::f64, Legal);
|
||||
setOperationAction(Op, MVT::v2f16, Expand);
|
||||
}
|
||||
|
||||
// 'Expand' implements FCOPYSIGN without calling an external library.
|
||||
setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
|
||||
setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
|
||||
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
|
||||
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
|
||||
|
||||
// FP16 does not support these nodes in hardware, but we can perform
|
||||
// these ops using single-precision hardware.
|
||||
setOperationAction(ISD::FDIV, MVT::f16, Promote);
|
||||
setOperationAction(ISD::FREM, MVT::f16, Promote);
|
||||
setOperationAction(ISD::FSQRT, MVT::f16, Promote);
|
||||
setOperationAction(ISD::FSIN, MVT::f16, Promote);
|
||||
setOperationAction(ISD::FCOS, MVT::f16, Promote);
|
||||
setOperationAction(ISD::FABS, MVT::f16, Promote);
|
||||
// These map to corresponding instructions for f32/f64. f16 must be
|
||||
// promoted to f32. v2f16 is expanded to f16, which is then promoted
|
||||
// to f32.
|
||||
for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
|
||||
ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
|
||||
setOperationAction(Op, MVT::f16, Promote);
|
||||
setOperationAction(Op, MVT::f32, Legal);
|
||||
setOperationAction(Op, MVT::f64, Legal);
|
||||
setOperationAction(Op, MVT::v2f16, Expand);
|
||||
}
|
||||
setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
|
||||
setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
|
||||
setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
|
||||
@ -660,6 +681,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
return "NVPTXISD::FUN_SHFR_CLAMP";
|
||||
case NVPTXISD::IMAD:
|
||||
return "NVPTXISD::IMAD";
|
||||
case NVPTXISD::SETP_F16X2:
|
||||
return "NVPTXISD::SETP_F16X2";
|
||||
case NVPTXISD::Dummy:
|
||||
return "NVPTXISD::Dummy";
|
||||
case NVPTXISD::MUL_WIDE_SIGNED:
|
||||
@ -1158,7 +1181,8 @@ TargetLoweringBase::LegalizeTypeAction
|
||||
NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
  // Vectors of i1 with more than one element are always split.
  const bool IsMultiElementI1 =
      VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1;
  if (IsMultiElementI1)
    return TypeSplitVector;

  // v2f16 is reported as legal; every other vector type defers to the
  // generic TargetLoweringBase policy.
  return VT == MVT::v2f16 ? TypeLegal
                          : TargetLoweringBase::getPreferredVectorAction(VT);
}
|
||||
|
||||
@ -1723,7 +1747,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
|
||||
bool ExtendIntegerRetVal =
|
||||
RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
|
||||
|
||||
for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
|
||||
for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
|
||||
bool needTruncate = false;
|
||||
EVT TheLoadType = VTs[i];
|
||||
EVT EltType = Ins[i].VT;
|
||||
@ -1765,11 +1789,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
|
||||
llvm_unreachable("Invalid vector info.");
|
||||
}
|
||||
|
||||
SDValue VectorOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
|
||||
DAG.getConstant(Offsets[VecIdx], dl, MVT::i32),
|
||||
InFlag};
|
||||
SDValue LoadOperands[] = {
|
||||
Chain, DAG.getConstant(1, dl, MVT::i32),
|
||||
DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
|
||||
SDValue RetVal = DAG.getMemIntrinsicNode(
|
||||
Op, dl, DAG.getVTList(LoadVTs), VectorOps, TheLoadType,
|
||||
Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
|
||||
MachinePointerInfo(), EltAlign);
|
||||
|
||||
for (unsigned j = 0; j < NumElts; ++j) {
|
||||
@ -1823,6 +1847,55 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
|
||||
return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
|
||||
}
|
||||
|
||||
// We can init constant f16x2 with a single .b32 move. Normally it
|
||||
// would get lowered as two constant loads and vector-packing move.
|
||||
// mov.b16 %h1, 0x4000;
|
||||
// mov.b16 %h2, 0x3C00;
|
||||
// mov.b32 %hh2, {%h2, %h1};
|
||||
// Instead we want just a constant move:
|
||||
// mov.b32 %hh2, 0x40003C00
|
||||
//
|
||||
// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
|
||||
// generates good SASS in both cases.
|
||||
SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                               SelectionDAG &DAG) const {
  // Only a v2f16 BUILD_VECTOR whose two operands are both FP constants is
  // rewritten; every other node is handed back unchanged.
  if (Op->getValueType(0) != MVT::v2f16)
    return Op;
  const auto *Elt0 = dyn_cast<ConstantFPSDNode>(Op->getOperand(0));
  if (!Elt0)
    return Op;
  const auto *Elt1 = dyn_cast<ConstantFPSDNode>(Op->getOperand(1));
  if (!Elt1)
    return Op;

  // Take the raw bit pattern of each constant, widen both to 32 bits and
  // place element 1 above element 0 (shifted left by 16).
  const APInt Bits0 = Elt0->getValueAPF().bitcastToAPInt();
  const APInt Bits1 = Elt1->getValueAPF().bitcastToAPInt();

  SDLoc DL(Op);
  SDValue Packed =
      DAG.getConstant(Bits1.zext(32).shl(16) | Bits0.zext(32), DL, MVT::i32);

  // Reinterpret the packed i32 constant as the original v2f16 value.
  return DAG.getNode(ISD::BITCAST, DL, MVT::v2f16, Packed);
}
|
||||
|
||||
SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
                                                     SelectionDAG &DAG) const {
  SDValue Idx = Op->getOperand(1);
  // A constant index needs no custom lowering: tablegen patterns match it.
  if (isa<ConstantSDNode>(Idx.getNode()))
    return Op;

  // Variable index: extract both elements and pick one with a select.
  SDValue Vec = Op->getOperand(0);
  EVT VecVT = Vec.getValueType();
  assert(VecVT == MVT::v2f16 && "Unexpected vector type.");
  EVT ScalarVT = VecVT.getVectorElementType();

  SDLoc Loc(Op.getNode());
  SDValue Elts[2];
  for (unsigned I = 0; I != 2; ++I)
    Elts[I] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Loc, ScalarVT, Vec,
                          DAG.getIntPtrConstant(I, Loc));

  // Idx == 0 selects element 0; any other index value selects element 1.
  return DAG.getSelectCC(Loc, Idx, DAG.getIntPtrConstant(0, Loc), Elts[0],
                         Elts[1], ISD::CondCode::SETEQ);
}
|
||||
|
||||
/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
|
||||
/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
|
||||
/// amount, or
|
||||
@ -1956,8 +2029,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
||||
case ISD::INTRINSIC_W_CHAIN:
|
||||
return Op;
|
||||
case ISD::BUILD_VECTOR:
|
||||
return LowerBUILD_VECTOR(Op, DAG);
|
||||
case ISD::EXTRACT_SUBVECTOR:
|
||||
return Op;
|
||||
case ISD::EXTRACT_VECTOR_ELT:
|
||||
return LowerEXTRACT_VECTOR_ELT(Op, DAG);
|
||||
case ISD::CONCAT_VECTORS:
|
||||
return LowerCONCAT_VECTORS(Op, DAG);
|
||||
case ISD::STORE:
|
||||
@ -2054,12 +2130,15 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
|
||||
case MVT::v2i16:
|
||||
case MVT::v2i32:
|
||||
case MVT::v2i64:
|
||||
case MVT::v2f16:
|
||||
case MVT::v2f32:
|
||||
case MVT::v2f64:
|
||||
case MVT::v4i8:
|
||||
case MVT::v4i16:
|
||||
case MVT::v4i32:
|
||||
case MVT::v4f16:
|
||||
case MVT::v4f32:
|
||||
case MVT::v8f16: // <4 x f16x2>
|
||||
// This is a "native" vector type
|
||||
break;
|
||||
}
|
||||
@ -2090,6 +2169,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
|
||||
if (EltVT.getSizeInBits() < 16)
|
||||
NeedExt = true;
|
||||
|
||||
bool StoreF16x2 = false;
|
||||
switch (NumElts) {
|
||||
default:
|
||||
return SDValue();
|
||||
@ -2099,6 +2179,14 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
|
||||
case 4:
|
||||
Opcode = NVPTXISD::StoreV4;
|
||||
break;
|
||||
case 8:
|
||||
// v8f16 is a special case. PTX doesn't have st.v8.f16
|
||||
// instruction. Instead, we split the vector into v2f16 chunks and
|
||||
// store them with st.v4.b32.
|
||||
assert(EltVT == MVT::f16 && "Wrong type for the vector.");
|
||||
Opcode = NVPTXISD::StoreV4;
|
||||
StoreF16x2 = true;
|
||||
break;
|
||||
}
|
||||
|
||||
SmallVector<SDValue, 8> Ops;
|
||||
@ -2106,23 +2194,36 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
|
||||
// First is the chain
|
||||
Ops.push_back(N->getOperand(0));
|
||||
|
||||
// Then the split values
|
||||
for (unsigned i = 0; i < NumElts; ++i) {
|
||||
SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
|
||||
DAG.getIntPtrConstant(i, DL));
|
||||
if (NeedExt)
|
||||
ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
|
||||
Ops.push_back(ExtVal);
|
||||
if (StoreF16x2) {
|
||||
// Combine f16,f16 -> v2f16
|
||||
NumElts /= 2;
|
||||
for (unsigned i = 0; i < NumElts; ++i) {
|
||||
SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
|
||||
DAG.getIntPtrConstant(i * 2, DL));
|
||||
SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
|
||||
DAG.getIntPtrConstant(i * 2 + 1, DL));
|
||||
SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
|
||||
Ops.push_back(V2);
|
||||
}
|
||||
} else {
|
||||
// Then the split values
|
||||
for (unsigned i = 0; i < NumElts; ++i) {
|
||||
SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
|
||||
DAG.getIntPtrConstant(i, DL));
|
||||
if (NeedExt)
|
||||
ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
|
||||
Ops.push_back(ExtVal);
|
||||
}
|
||||
}
|
||||
|
||||
// Then any remaining arguments
|
||||
Ops.append(N->op_begin() + 2, N->op_end());
|
||||
|
||||
SDValue NewSt = DAG.getMemIntrinsicNode(
|
||||
Opcode, DL, DAG.getVTList(MVT::Other), Ops,
|
||||
MemSD->getMemoryVT(), MemSD->getMemOperand());
|
||||
SDValue NewSt =
|
||||
DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
|
||||
MemSD->getMemoryVT(), MemSD->getMemOperand());
|
||||
|
||||
//return DCI.CombineTo(N, NewSt, true);
|
||||
// return DCI.CombineTo(N, NewSt, true);
|
||||
return NewSt;
|
||||
}
|
||||
|
||||
@ -2282,7 +2383,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
|
||||
SmallVector<EVT, 16> VTs;
|
||||
SmallVector<uint64_t, 16> Offsets;
|
||||
ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
|
||||
assert(VTs.size() > 0 && "empty aggregate type not expected");
|
||||
assert(VTs.size() > 0 && "Unexpected empty type.");
|
||||
auto VectorInfo =
|
||||
VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
|
||||
|
||||
@ -2299,7 +2400,15 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
|
||||
unsigned NumElts = parti - VecIdx + 1;
|
||||
EVT EltVT = VTs[parti];
|
||||
// i1 is loaded/stored as i8.
|
||||
EVT LoadVT = EltVT == MVT::i1 ? MVT::i8 : EltVT;
|
||||
EVT LoadVT = EltVT;
|
||||
if (EltVT == MVT::i1)
|
||||
LoadVT = MVT::i8;
|
||||
else if (EltVT == MVT::v2f16)
|
||||
// getLoad needs a vector type, but it can't handle
|
||||
// vectors which contain v2f16 elements. So we must load
|
||||
// using i32 here and then bitcast back.
|
||||
LoadVT = MVT::i32;
|
||||
|
||||
EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
|
||||
SDValue VecAddr =
|
||||
DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
|
||||
@ -2319,15 +2428,20 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
|
||||
// We've loaded i1 as an i8 and now must truncate it back to i1
|
||||
if (EltVT == MVT::i1)
|
||||
Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
|
||||
// Extend the element if necesary (e.g an i8 is loaded
|
||||
// v2f16 was loaded as an i32. Now we must bitcast it back.
|
||||
else if (EltVT == MVT::v2f16)
|
||||
Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
|
||||
// Extend the element if necessary (e.g. an i8 is loaded
|
||||
// into an i16 register)
|
||||
if (Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
|
||||
if (Ins[InsIdx].VT.isInteger() &&
|
||||
Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
|
||||
unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
|
||||
: ISD::ZERO_EXTEND;
|
||||
Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
|
||||
}
|
||||
InVals.push_back(Elt);
|
||||
}
|
||||
|
||||
// Reset vector tracking state.
|
||||
VecIdx = -1;
|
||||
}
|
||||
@ -2399,7 +2513,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
|
||||
RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
|
||||
|
||||
SmallVector<SDValue, 6> StoreOperands;
|
||||
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
|
||||
for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
|
||||
// New load/store. Record chain and offset operands.
|
||||
if (VectorInfo[i] & PVF_FIRST) {
|
||||
assert(StoreOperands.empty() && "Orphaned operand list.");
|
||||
@ -4168,6 +4282,27 @@ static SDValue PerformSHLCombine(SDNode *N,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
static SDValue PerformSETCCCombine(SDNode *N,
                                   TargetLowering::DAGCombinerInfo &DCI) {
  EVT ResultVT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);

  // Only a v2f16 comparison that produces v2i1 is combined here.
  const bool IsF16x2Compare =
      ResultVT == MVT::v2i1 && LHS.getValueType() == MVT::v2f16;
  if (!IsF16x2Compare)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // setp.f16x2 yields two scalar predicates. They are repacked into a v2i1
  // BUILD_VECTOR, which the legalizer will scalarize later, while the
  // comparison itself stays a single vector instruction.
  SDValue CmpOps[] = {LHS, RHS, N->getOperand(2)};
  SDValue Cmp = DAG.getNode(NVPTXISD::SETP_F16X2, DL,
                            DAG.getVTList(MVT::i1, MVT::i1), CmpOps);
  return DAG.getNode(ISD::BUILD_VECTOR, DL, ResultVT, Cmp.getValue(0),
                     Cmp.getValue(1));
}
|
||||
|
||||
SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
DAGCombinerInfo &DCI) const {
|
||||
CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
|
||||
@ -4185,6 +4320,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
|
||||
case ISD::UREM:
|
||||
case ISD::SREM:
|
||||
return PerformREMCombine(N, DCI, OptLevel);
|
||||
case ISD::SETCC:
|
||||
return PerformSETCCCombine(N, DCI);
|
||||
}
|
||||
return SDValue();
|
||||
}
|
||||
@ -4208,12 +4345,15 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
|
||||
case MVT::v2i16:
|
||||
case MVT::v2i32:
|
||||
case MVT::v2i64:
|
||||
case MVT::v2f16:
|
||||
case MVT::v2f32:
|
||||
case MVT::v2f64:
|
||||
case MVT::v4i8:
|
||||
case MVT::v4i16:
|
||||
case MVT::v4i32:
|
||||
case MVT::v4f16:
|
||||
case MVT::v4f32:
|
||||
case MVT::v8f16: // <4 x f16x2>
|
||||
// This is a "native" vector type
|
||||
break;
|
||||
}
|
||||
@ -4247,6 +4387,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
|
||||
|
||||
unsigned Opcode = 0;
|
||||
SDVTList LdResVTs;
|
||||
bool LoadF16x2 = false;
|
||||
|
||||
switch (NumElts) {
|
||||
default:
|
||||
@ -4261,6 +4402,18 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
|
||||
LdResVTs = DAG.getVTList(ListVTs);
|
||||
break;
|
||||
}
|
||||
case 8: {
|
||||
// v8f16 is a special case. PTX doesn't have ld.v8.f16
|
||||
// instruction. Instead, we split the vector into v2f16 chunks and
|
||||
// load them with ld.v4.b32.
|
||||
assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
|
||||
LoadF16x2 = true;
|
||||
Opcode = NVPTXISD::LoadV4;
|
||||
EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
|
||||
MVT::Other};
|
||||
LdResVTs = DAG.getVTList(ListVTs);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Copy regular operands
|
||||
@ -4274,13 +4427,26 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
|
||||
LD->getMemoryVT(),
|
||||
LD->getMemOperand());
|
||||
|
||||
SmallVector<SDValue, 4> ScalarRes;
|
||||
|
||||
for (unsigned i = 0; i < NumElts; ++i) {
|
||||
SDValue Res = NewLD.getValue(i);
|
||||
if (NeedTrunc)
|
||||
Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
|
||||
ScalarRes.push_back(Res);
|
||||
SmallVector<SDValue, 8> ScalarRes;
|
||||
if (LoadF16x2) {
|
||||
// Split v2f16 subvectors back into individual elements.
|
||||
NumElts /= 2;
|
||||
for (unsigned i = 0; i < NumElts; ++i) {
|
||||
SDValue SubVector = NewLD.getValue(i);
|
||||
SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
|
||||
DAG.getIntPtrConstant(0, DL));
|
||||
SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
|
||||
DAG.getIntPtrConstant(1, DL));
|
||||
ScalarRes.push_back(E0);
|
||||
ScalarRes.push_back(E1);
|
||||
}
|
||||
} else {
|
||||
for (unsigned i = 0; i < NumElts; ++i) {
|
||||
SDValue Res = NewLD.getValue(i);
|
||||
if (NeedTrunc)
|
||||
Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
|
||||
ScalarRes.push_back(Res);
|
||||
}
|
||||
}
|
||||
|
||||
SDValue LoadChain = NewLD.getValue(NumElts);
|
||||
|
@ -56,6 +56,7 @@ enum NodeType : unsigned {
|
||||
MUL_WIDE_SIGNED,
|
||||
MUL_WIDE_UNSIGNED,
|
||||
IMAD,
|
||||
SETP_F16X2,
|
||||
Dummy,
|
||||
|
||||
LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
|
||||
@ -549,14 +550,15 @@ private:
|
||||
const NVPTXSubtarget &STI; // cache the subtarget here
|
||||
SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
|
||||
|
||||
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSTOREf16(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
@ -55,6 +55,8 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
|
||||
} else if (DestRC == &NVPTX::Float16RegsRegClass) {
|
||||
Op = (SrcRC == &NVPTX::Float16RegsRegClass ? NVPTX::FMOV16rr
|
||||
: NVPTX::BITCONVERT_16_I2F);
|
||||
} else if (DestRC == &NVPTX::Float16x2RegsRegClass) {
|
||||
Op = NVPTX::IMOV32rr;
|
||||
} else if (DestRC == &NVPTX::Float32RegsRegClass) {
|
||||
Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr
|
||||
: NVPTX::BITCONVERT_32_I2F);
|
||||
|
@ -102,6 +102,9 @@ def CmpNAN_FTZ : PatLeaf<(i32 0x111)>;
|
||||
def CmpMode : Operand<i32> {
|
||||
let PrintMethod = "printCmpMode";
|
||||
}
|
||||
def VecElement : Operand<i32> {
|
||||
let PrintMethod = "printVecElement";
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// NVPTX Instruction Predicate Definitions
|
||||
@ -305,6 +308,19 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
|
||||
[(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
|
||||
Requires<[useFP16Math, allowFMA]>;
|
||||
|
||||
def f16x2rr_ftz :
|
||||
NVPTXInst<(outs Float16x2Regs:$dst),
|
||||
(ins Float16x2Regs:$a, Float16x2Regs:$b),
|
||||
!strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
|
||||
[(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
|
||||
Requires<[useFP16Math, allowFMA, doF32FTZ]>;
|
||||
def f16x2rr :
|
||||
NVPTXInst<(outs Float16x2Regs:$dst),
|
||||
(ins Float16x2Regs:$a, Float16x2Regs:$b),
|
||||
!strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
|
||||
[(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
|
||||
Requires<[useFP16Math, allowFMA]>;
|
||||
|
||||
// These have strange names so we don't perturb existing mir tests.
|
||||
def _rnf64rr :
|
||||
NVPTXInst<(outs Float64Regs:$dst),
|
||||
@ -354,6 +370,18 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
|
||||
!strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
|
||||
[(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
|
||||
Requires<[useFP16Math, noFMA]>;
|
||||
def _rnf16x2rr_ftz :
|
||||
NVPTXInst<(outs Float16x2Regs:$dst),
|
||||
(ins Float16x2Regs:$a, Float16x2Regs:$b),
|
||||
!strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
|
||||
[(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
|
||||
Requires<[useFP16Math, noFMA, doF32FTZ]>;
|
||||
def _rnf16x2rr :
|
||||
NVPTXInst<(outs Float16x2Regs:$dst),
|
||||
(ins Float16x2Regs:$a, Float16x2Regs:$b),
|
||||
!strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
|
||||
[(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
|
||||
Requires<[useFP16Math, noFMA]>;
|
||||
}
|
||||
|
||||
// Template for operations which take two f32 or f64 operands. Provides three
|
||||
@ -991,15 +1019,17 @@ multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred>
|
||||
Requires<[Pred]>;
|
||||
}
|
||||
|
||||
multiclass FMA_F16<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
|
||||
multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
|
||||
def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
|
||||
!strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
|
||||
[(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
|
||||
Requires<[useFP16Math, Pred]>;
|
||||
}
|
||||
|
||||
defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, f16imm, doF32FTZ>;
|
||||
defm FMA16 : FMA_F16<"fma.rn.f16", Float16Regs, f16imm, true>;
|
||||
defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
|
||||
defm FMA16 : FMA_F16<"fma.rn.f16", Float16Regs, true>;
|
||||
defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
|
||||
defm FMA16x2 : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>;
|
||||
defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
|
||||
defm FMA32 : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
|
||||
defm FMA64 : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
|
||||
@ -1390,9 +1420,17 @@ defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
|
||||
def SETP_f16rr :
|
||||
NVPTXInst<(outs Int1Regs:$dst),
|
||||
(ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
|
||||
"setp${cmp:base}${cmp:ftz}.f16 $dst, $a, $b;",
|
||||
"setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
|
||||
[]>, Requires<[useFP16Math]>;
|
||||
|
||||
def SETP_f16x2rr :
|
||||
NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
|
||||
(ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp),
|
||||
"setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
|
||||
[]>,
|
||||
Requires<[useFP16Math]>;
|
||||
|
||||
|
||||
// FIXME: This doesn't appear to be correct. The "set" mnemonic has the form
|
||||
// "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
|
||||
// reg, either u32, s32, or f32. Anyway these aren't used at the moment.
|
||||
@ -1488,6 +1526,13 @@ defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>;
|
||||
defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
|
||||
defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
|
||||
|
||||
def SELP_f16x2rr :
|
||||
NVPTXInst<(outs Float16x2Regs:$dst),
|
||||
(ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
|
||||
"selp.b32 \t$dst, $a, $b, $p;",
|
||||
[(set Float16x2Regs:$dst,
|
||||
(select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>;
|
||||
|
||||
//-----------------------------------
|
||||
// Data Movement (Load / Store, Move)
|
||||
//-----------------------------------
|
||||
@ -2061,10 +2106,15 @@ def LoadParamMemV4I32 : LoadParamV4MemInst<Int32Regs, ".b32">;
|
||||
def LoadParamMemV4I16 : LoadParamV4MemInst<Int16Regs, ".b16">;
|
||||
def LoadParamMemV4I8 : LoadParamV4MemInst<Int16Regs, ".b8">;
|
||||
def LoadParamMemF16 : LoadParamMemInst<Float16Regs, ".b16">;
|
||||
def LoadParamMemF16x2 : LoadParamMemInst<Float16x2Regs, ".b32">;
|
||||
def LoadParamMemF32 : LoadParamMemInst<Float32Regs, ".f32">;
|
||||
def LoadParamMemF64 : LoadParamMemInst<Float64Regs, ".f64">;
|
||||
def LoadParamMemV2F16 : LoadParamV2MemInst<Float16Regs, ".b16">;
|
||||
def LoadParamMemV2F16x2: LoadParamV2MemInst<Float16x2Regs, ".b32">;
|
||||
def LoadParamMemV2F32 : LoadParamV2MemInst<Float32Regs, ".f32">;
|
||||
def LoadParamMemV2F64 : LoadParamV2MemInst<Float64Regs, ".f64">;
|
||||
def LoadParamMemV4F16 : LoadParamV4MemInst<Float16Regs, ".b16">;
|
||||
def LoadParamMemV4F16x2: LoadParamV4MemInst<Float16x2Regs, ".b32">;
|
||||
def LoadParamMemV4F32 : LoadParamV4MemInst<Float32Regs, ".f32">;
|
||||
|
||||
def StoreParamI64 : StoreParamInst<Int64Regs, ".b64">;
|
||||
@ -2082,10 +2132,15 @@ def StoreParamV4I16 : StoreParamV4Inst<Int16Regs, ".b16">;
|
||||
def StoreParamV4I8 : StoreParamV4Inst<Int16Regs, ".b8">;
|
||||
|
||||
def StoreParamF16 : StoreParamInst<Float16Regs, ".b16">;
|
||||
def StoreParamF16x2 : StoreParamInst<Float16x2Regs, ".b32">;
|
||||
def StoreParamF32 : StoreParamInst<Float32Regs, ".f32">;
|
||||
def StoreParamF64 : StoreParamInst<Float64Regs, ".f64">;
|
||||
def StoreParamV2F16 : StoreParamV2Inst<Float16Regs, ".b16">;
|
||||
def StoreParamV2F16x2 : StoreParamV2Inst<Float16x2Regs, ".b32">;
|
||||
def StoreParamV2F32 : StoreParamV2Inst<Float32Regs, ".f32">;
|
||||
def StoreParamV2F64 : StoreParamV2Inst<Float64Regs, ".f64">;
|
||||
def StoreParamV4F16 : StoreParamV4Inst<Float16Regs, ".b16">;
|
||||
def StoreParamV4F16x2 : StoreParamV4Inst<Float16x2Regs, ".b32">;
|
||||
def StoreParamV4F32 : StoreParamV4Inst<Float32Regs, ".f32">;
|
||||
|
||||
def StoreRetvalI64 : StoreRetvalInst<Int64Regs, ".b64">;
|
||||
@ -2103,9 +2158,14 @@ def StoreRetvalV4I8 : StoreRetvalV4Inst<Int16Regs, ".b8">;
|
||||
def StoreRetvalF64 : StoreRetvalInst<Float64Regs, ".f64">;
|
||||
def StoreRetvalF32 : StoreRetvalInst<Float32Regs, ".f32">;
|
||||
def StoreRetvalF16 : StoreRetvalInst<Float16Regs, ".b16">;
|
||||
def StoreRetvalF16x2 : StoreRetvalInst<Float16x2Regs, ".b32">;
|
||||
def StoreRetvalV2F64 : StoreRetvalV2Inst<Float64Regs, ".f64">;
|
||||
def StoreRetvalV2F32 : StoreRetvalV2Inst<Float32Regs, ".f32">;
|
||||
def StoreRetvalV2F16 : StoreRetvalV2Inst<Float16Regs, ".b16">;
|
||||
def StoreRetvalV2F16x2: StoreRetvalV2Inst<Float16x2Regs, ".b32">;
|
||||
def StoreRetvalV4F32 : StoreRetvalV4Inst<Float32Regs, ".f32">;
|
||||
def StoreRetvalV4F16 : StoreRetvalV4Inst<Float16Regs, ".b16">;
|
||||
def StoreRetvalV4F16x2: StoreRetvalV4Inst<Float16x2Regs, ".b32">;
|
||||
|
||||
def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
|
||||
def CallArgEndInst1 : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
|
||||
@ -2252,6 +2312,7 @@ let mayLoad=1, hasSideEffects=0 in {
|
||||
defm LD_i32 : LD<Int32Regs>;
|
||||
defm LD_i64 : LD<Int64Regs>;
|
||||
defm LD_f16 : LD<Float16Regs>;
|
||||
defm LD_f16x2 : LD<Float16x2Regs>;
|
||||
defm LD_f32 : LD<Float32Regs>;
|
||||
defm LD_f64 : LD<Float64Regs>;
|
||||
}
|
||||
@ -2301,6 +2362,7 @@ let mayStore=1, hasSideEffects=0 in {
|
||||
defm ST_i32 : ST<Int32Regs>;
|
||||
defm ST_i64 : ST<Int64Regs>;
|
||||
defm ST_f16 : ST<Float16Regs>;
|
||||
defm ST_f16x2 : ST<Float16x2Regs>;
|
||||
defm ST_f32 : ST<Float32Regs>;
|
||||
defm ST_f64 : ST<Float64Regs>;
|
||||
}
|
||||
@ -2387,6 +2449,7 @@ let mayLoad=1, hasSideEffects=0 in {
|
||||
defm LDV_i16 : LD_VEC<Int16Regs>;
|
||||
defm LDV_i32 : LD_VEC<Int32Regs>;
|
||||
defm LDV_i64 : LD_VEC<Int64Regs>;
|
||||
defm LDV_f16 : LD_VEC<Float16Regs>;
|
||||
defm LDV_f32 : LD_VEC<Float32Regs>;
|
||||
defm LDV_f64 : LD_VEC<Float64Regs>;
|
||||
}
|
||||
@ -2480,17 +2543,18 @@ let mayStore=1, hasSideEffects=0 in {
|
||||
defm STV_i16 : ST_VEC<Int16Regs>;
|
||||
defm STV_i32 : ST_VEC<Int32Regs>;
|
||||
defm STV_i64 : ST_VEC<Int64Regs>;
|
||||
defm STV_f16 : ST_VEC<Float16Regs>;
|
||||
defm STV_f16x2 : ST_VEC<Float16x2Regs>;
|
||||
defm STV_f32 : ST_VEC<Float32Regs>;
|
||||
defm STV_f64 : ST_VEC<Float64Regs>;
|
||||
}
|
||||
|
||||
|
||||
//---- Conversion ----
|
||||
|
||||
class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
|
||||
NVPTXRegClass regclassOut> :
|
||||
NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
|
||||
!strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")),
|
||||
!strconcat("mov.b", !strconcat(SzStr, " \t$d, $a;")),
|
||||
[(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
|
||||
|
||||
def BITCONVERT_16_I2F : F_BITCONVERT<"16", Int16Regs, Float16Regs>;
|
||||
@ -2499,6 +2563,8 @@ def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
|
||||
def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
|
||||
def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
|
||||
def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
|
||||
def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>;
|
||||
def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>;
|
||||
|
||||
// NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
|
||||
// we cannot specify floating-point literals in isel patterns. Therefore, we
|
||||
@ -2741,6 +2807,9 @@ def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
|
||||
def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
|
||||
(SELP_b64rr Int64Regs:$a, Int64Regs:$b,
|
||||
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
|
||||
def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b),
|
||||
(SELP_f16rr Float16Regs:$a, Float16Regs:$b,
|
||||
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
|
||||
def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
|
||||
(SELP_f32rr Float32Regs:$a, Float32Regs:$b,
|
||||
(SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
|
||||
@ -2779,6 +2848,49 @@ let hasSideEffects = 0 in {
|
||||
def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
|
||||
(ins Float64Regs:$s),
|
||||
"mov.b64 \t{{$d1, $d2}}, $s;", []>;
|
||||
|
||||
}
|
||||
|
||||
let hasSideEffects = 0 in {
|
||||
// Extract element of f16x2 register. PTX does not provide any way
|
||||
// to access elements of f16x2 vector directly, so we need to
|
||||
// extract it using a temporary register.
|
||||
def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst),
|
||||
(ins Float16x2Regs:$src),
|
||||
"{{ .reg .b16 \t%tmp_hi;\n\t"
|
||||
" mov.b32 \t{$dst, %tmp_hi}, $src; }}",
|
||||
[(set Float16Regs:$dst,
|
||||
(extractelt (v2f16 Float16x2Regs:$src), 0))]>;
|
||||
def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst),
|
||||
(ins Float16x2Regs:$src),
|
||||
"{{ .reg .b16 \t%tmp_lo;\n\t"
|
||||
" mov.b32 \t{%tmp_lo, $dst}, $src; }}",
|
||||
[(set Float16Regs:$dst,
|
||||
(extractelt (v2f16 Float16x2Regs:$src), 1))]>;
|
||||
|
||||
// Coalesce two f16 registers into f16x2
|
||||
def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst),
|
||||
(ins Float16Regs:$a, Float16Regs:$b),
|
||||
"mov.b32 \t$dst, {{$a, $b}};",
|
||||
[(set Float16x2Regs:$dst,
|
||||
(build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;
|
||||
|
||||
// Directly initializing the underlying b32 register is one less SASS
|
||||
// instruction than a vector-packing move.
|
||||
def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),
|
||||
"mov.b32 \t$dst, $src;",
|
||||
[]>;
|
||||
|
||||
// Split f16x2 into two f16 registers.
|
||||
def SplitF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
|
||||
(ins Float16x2Regs:$src),
|
||||
"mov.b32 \t{{$lo, $hi}}, $src;",
|
||||
[]>;
|
||||
// Split an i32 into two f16
|
||||
def SplitI32toF16x2 : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
|
||||
(ins Int32Regs:$src),
|
||||
"mov.b32 \t{{$lo, $hi}}, $src;",
|
||||
[]>;
|
||||
}
|
||||
|
||||
// Count leading zeros
|
||||
|
@ -1606,6 +1606,10 @@ defm INT_PTX_LDG_GLOBAL_i32
|
||||
: LDG_G<"u32 \t$result, [$src];", Int32Regs>;
|
||||
defm INT_PTX_LDG_GLOBAL_i64
|
||||
: LDG_G<"u64 \t$result, [$src];", Int64Regs>;
|
||||
defm INT_PTX_LDG_GLOBAL_f16
|
||||
: LDG_G<"b16 \t$result, [$src];", Float16Regs>;
|
||||
defm INT_PTX_LDG_GLOBAL_f16x2
|
||||
: LDG_G<"b32 \t$result, [$src];", Float16x2Regs>;
|
||||
defm INT_PTX_LDG_GLOBAL_f32
|
||||
: LDG_G<"f32 \t$result, [$src];", Float32Regs>;
|
||||
defm INT_PTX_LDG_GLOBAL_f64
|
||||
@ -1661,6 +1665,8 @@ defm INT_PTX_LDG_G_v2i16_ELE
|
||||
: VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
|
||||
defm INT_PTX_LDG_G_v2i32_ELE
|
||||
: VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
|
||||
defm INT_PTX_LDG_G_v4f16_ELE
|
||||
: VLDG_G_ELE_V2<"v2.b32 \t{{$dst1, $dst2}}, [$src];", Float16x2Regs>;
|
||||
defm INT_PTX_LDG_G_v2f32_ELE
|
||||
: VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
|
||||
defm INT_PTX_LDG_G_v2i64_ELE
|
||||
@ -1673,6 +1679,8 @@ defm INT_PTX_LDG_G_v4i16_ELE
|
||||
: VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
|
||||
defm INT_PTX_LDG_G_v4i32_ELE
|
||||
: VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
|
||||
defm INT_PTX_LDG_G_v8f16_ELE
|
||||
: VLDG_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float16x2Regs>;
|
||||
defm INT_PTX_LDG_G_v4f32_ELE
|
||||
: VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
|
||||
|
||||
|
@ -35,6 +35,8 @@ std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
|
||||
// accepted for all supported fp16 instructions on all GPU
|
||||
// variants, so we can use them instead.
|
||||
return ".b16";
|
||||
if (RC == &NVPTX::Float16x2RegsRegClass)
|
||||
return ".b32";
|
||||
if (RC == &NVPTX::Float64RegsRegClass)
|
||||
return ".f64";
|
||||
if (RC == &NVPTX::Int64RegsRegClass)
|
||||
@ -73,6 +75,8 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
|
||||
return "%f";
|
||||
if (RC == &NVPTX::Float16RegsRegClass)
|
||||
return "%h";
|
||||
if (RC == &NVPTX::Float16x2RegsRegClass)
|
||||
return "%hh";
|
||||
if (RC == &NVPTX::Float64RegsRegClass)
|
||||
return "%fd";
|
||||
if (RC == &NVPTX::Int64RegsRegClass)
|
||||
|
@ -37,6 +37,7 @@ foreach i = 0-4 in {
|
||||
def R#i : NVPTXReg<"%r"#i>; // 32-bit
|
||||
def RL#i : NVPTXReg<"%rd"#i>; // 64-bit
|
||||
def H#i : NVPTXReg<"%h"#i>; // 16-bit float
|
||||
def HH#i : NVPTXReg<"%hh"#i>; // 2x16-bit float
|
||||
def F#i : NVPTXReg<"%f"#i>; // 32-bit float
|
||||
def FL#i : NVPTXReg<"%fd"#i>; // 64-bit float
|
||||
|
||||
@ -59,6 +60,7 @@ def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>;
|
||||
def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4))>;
|
||||
def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4))>;
|
||||
def Float16Regs : NVPTXRegClass<[f16], 16, (add (sequence "H%u", 0, 4))>;
|
||||
def Float16x2Regs : NVPTXRegClass<[v2f16], 32, (add (sequence "HH%u", 0, 4))>;
|
||||
def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
|
||||
def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;
|
||||
def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;
|
||||
|
@ -15,3 +15,37 @@ define i32 @f(i32* %p) {
|
||||
%sum = add i32 %v0, %v1
|
||||
ret i32 %sum
|
||||
}
|
||||
|
||||
define half @fh(half* %p) {
|
||||
%p.1 = getelementptr half, half* %p, i32 1
|
||||
%p.2 = getelementptr half, half* %p, i32 2
|
||||
%p.3 = getelementptr half, half* %p, i32 3
|
||||
%p.4 = getelementptr half, half* %p, i32 4
|
||||
%v0 = load half, half* %p, align 64
|
||||
%v1 = load half, half* %p.1, align 4
|
||||
%v2 = load half, half* %p.2, align 4
|
||||
%v3 = load half, half* %p.3, align 4
|
||||
%v4 = load half, half* %p.4, align 4
|
||||
%sum1 = fadd half %v0, %v1
|
||||
%sum2 = fadd half %v2, %v3
|
||||
%sum3 = fadd half %sum1, %sum2
|
||||
%sum = fadd half %sum3, %v4
|
||||
ret half %sum
|
||||
}
|
||||
|
||||
define float @ff(float* %p) {
|
||||
%p.1 = getelementptr float, float* %p, i32 1
|
||||
%p.2 = getelementptr float, float* %p, i32 2
|
||||
%p.3 = getelementptr float, float* %p, i32 3
|
||||
%p.4 = getelementptr float, float* %p, i32 4
|
||||
%v0 = load float, float* %p, align 64
|
||||
%v1 = load float, float* %p.1, align 4
|
||||
%v2 = load float, float* %p.2, align 4
|
||||
%v3 = load float, float* %p.3, align 4
|
||||
%v4 = load float, float* %p.4, align 4
|
||||
%sum1 = fadd float %v0, %v1
|
||||
%sum2 = fadd float %v2, %v3
|
||||
%sum3 = fadd float %sum1, %sum2
|
||||
%sum = fadd float %sum3, %v4
|
||||
ret float %sum
|
||||
}
|
||||
|
@ -127,13 +127,13 @@ define half @test_fdiv(half %a, half %b) #0 {
|
||||
; CHECK-LABEL: test_frem(
|
||||
; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_frem_param_0];
|
||||
; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_frem_param_1];
|
||||
; CHECK-DAG: cvt.f32.f16 [[F0:%f[0-9]+]], [[A]];
|
||||
; CHECK-DAG: cvt.f32.f16 [[F1:%f[0-9]+]], [[B]];
|
||||
; CHECK-NEXT: div.rn.f32 [[F2:%f[0-9]+]], [[F0]], [[F1]];
|
||||
; CHECK-NEXT: cvt.rmi.f32.f32 [[F3:%f[0-9]+]], [[F2]];
|
||||
; CHECK-NEXT: mul.f32 [[F4:%f[0-9]+]], [[F3]], [[F1]];
|
||||
; CHECK-NEXT: sub.f32 [[F5:%f[0-9]+]], [[F0]], [[F4]];
|
||||
; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[F5]];
|
||||
; CHECK-DAG: cvt.f32.f16 [[FA:%f[0-9]+]], [[A]];
|
||||
; CHECK-DAG: cvt.f32.f16 [[FB:%f[0-9]+]], [[B]];
|
||||
; CHECK-NEXT: div.rn.f32 [[D:%f[0-9]+]], [[FA]], [[FB]];
|
||||
; CHECK-NEXT: cvt.rmi.f32.f32 [[DI:%f[0-9]+]], [[D]];
|
||||
; CHECK-NEXT: mul.f32 [[RI:%f[0-9]+]], [[DI]], [[FB]];
|
||||
; CHECK-NEXT: sub.f32 [[RF:%f[0-9]+]], [[FA]], [[RI]];
|
||||
; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
|
||||
; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
|
||||
; CHECK-NEXT: ret;
|
||||
define half @test_frem(half %a, half %b) #0 {
|
||||
@ -509,7 +509,7 @@ define i1 @test_fcmp_ord(half %a, half %b) #0 {
|
||||
; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
|
||||
; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
|
||||
; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
|
||||
; CHECK-NEXT: @%p1 bra [[LABEL:LBB.*]];
|
||||
; CHECK-NEXT: @[[PRED]] bra [[LABEL:LBB.*]];
|
||||
; CHECK: st.u32 [%[[C]]],
|
||||
; CHECK: [[LABEL]]:
|
||||
; CHECK: st.u32 [%[[D]]],
|
||||
|
1433
test/CodeGen/NVPTX/f16x2-instructions.ll
Normal file
1433
test/CodeGen/NVPTX/f16x2-instructions.ll
Normal file
File diff suppressed because it is too large
Load Diff
@ -4,9 +4,9 @@
|
||||
%s_i1 = type { i1 }
|
||||
%s_i8 = type { i8 }
|
||||
%s_i16 = type { i16 }
|
||||
%s_half = type { half }
|
||||
%s_f16 = type { half }
|
||||
%s_i32 = type { i32 }
|
||||
%s_float = type { float }
|
||||
%s_f32 = type { float }
|
||||
%s_i64 = type { i64 }
|
||||
%s_f64 = type { double }
|
||||
|
||||
@ -322,22 +322,148 @@ define <5 x i16> @test_v5i16(<5 x i16> %a) {
|
||||
}
|
||||
|
||||
; CHECK: .func (.param .b32 func_retval0)
|
||||
; CHECK-LABEL: test_half(
|
||||
; CHECK-NEXT: .param .b32 test_half_param_0
|
||||
; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_half_param_0];
|
||||
; CHECK-LABEL: test_f16(
|
||||
; CHECK-NEXT: .param .b32 test_f16_param_0
|
||||
; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_f16_param_0];
|
||||
; CHECK: .param .b32 param0;
|
||||
; CHECK: st.param.b16 [param0+0], [[E]];
|
||||
; CHECK: .param .b32 retval0;
|
||||
; CHECK: call.uni (retval0),
|
||||
; CHECK-NEXT: test_half,
|
||||
; CHECK-NEXT: test_f16,
|
||||
; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
|
||||
; CHECK: st.param.b16 [func_retval0+0], [[R]]
|
||||
; CHECK-NEXT: ret;
|
||||
define half @test_half(half %a) {
|
||||
%r = tail call half @test_half(half %a);
|
||||
define half @test_f16(half %a) {
|
||||
%r = tail call half @test_f16(half %a);
|
||||
ret half %r;
|
||||
}
|
||||
|
||||
; CHECK: .func (.param .align 4 .b8 func_retval0[4])
|
||||
; CHECK-LABEL: test_v2f16(
|
||||
; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4]
|
||||
; CHECK: ld.param.b32 [[E:%hh[0-9]+]], [test_v2f16_param_0];
|
||||
; CHECK: .param .align 4 .b8 param0[4];
|
||||
; CHECK: st.param.b32 [param0+0], [[E]];
|
||||
; CHECK: .param .align 4 .b8 retval0[4];
|
||||
; CHECK: call.uni (retval0),
|
||||
; CHECK-NEXT: test_v2f16,
|
||||
; CHECK: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
|
||||
; CHECK: st.param.b32 [func_retval0+0], [[R]]
|
||||
; CHECK-NEXT: ret;
|
||||
define <2 x half> @test_v2f16(<2 x half> %a) {
|
||||
%r = tail call <2 x half> @test_v2f16(<2 x half> %a);
|
||||
ret <2 x half> %r;
|
||||
}
|
||||
|
||||
; CHECK:.func (.param .align 8 .b8 func_retval0[8])
|
||||
; CHECK-LABEL: test_v3f16(
|
||||
; CHECK: .param .align 8 .b8 test_v3f16_param_0[8]
|
||||
; CHECK-DAG: ld.param.b32 [[HH01:%hh[0-9]+]], [test_v3f16_param_0];
|
||||
; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
|
||||
; CHECK-DAG: ld.param.b16 [[E2:%h[0-9]+]], [test_v3f16_param_0+4];
|
||||
; CHECK: .param .align 8 .b8 param0[8];
|
||||
; CHECK-DAG: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
|
||||
; CHECK-DAG: st.param.b16 [param0+4], [[E2]];
|
||||
; CHECK: .param .align 8 .b8 retval0[8];
|
||||
; CHECK: call.uni (retval0),
|
||||
; CHECK: test_v3f16,
|
||||
; CHECK-DAG: ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0];
|
||||
; CHECK-DAG: ld.param.b16 [[R2:%h[0-9]+]], [retval0+4];
|
||||
; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]};
|
||||
; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]];
|
||||
; CHECK: ret;
|
||||
define <3 x half> @test_v3f16(<3 x half> %a) {
|
||||
%r = tail call <3 x half> @test_v3f16(<3 x half> %a);
|
||||
ret <3 x half> %r;
|
||||
}
|
||||
|
||||
; CHECK:.func (.param .align 8 .b8 func_retval0[8])
|
||||
; CHECK-LABEL: test_v4f16(
|
||||
; CHECK: .param .align 8 .b8 test_v4f16_param_0[8]
|
||||
; CHECK: ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
|
||||
; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]];
|
||||
; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]];
|
||||
; CHECK: .param .align 8 .b8 param0[8];
|
||||
; CHECK: st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]};
|
||||
; CHECK: .param .align 8 .b8 retval0[8];
|
||||
; CHECK: call.uni (retval0),
|
||||
; CHECK: test_v4f16,
|
||||
; CHECK: ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0];
|
||||
; CHECK: st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]};
|
||||
; CHECK: ret;
|
||||
define <4 x half> @test_v4f16(<4 x half> %a) {
|
||||
%r = tail call <4 x half> @test_v4f16(<4 x half> %a);
|
||||
ret <4 x half> %r;
|
||||
}
|
||||
|
||||
; CHECK:.func (.param .align 16 .b8 func_retval0[16])
|
||||
; CHECK-LABEL: test_v5f16(
|
||||
; CHECK: .param .align 16 .b8 test_v5f16_param_0[16]
|
||||
; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0];
|
||||
; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
|
||||
; CHECK-DAG: ld.param.b16 [[E4:%h[0-9]+]], [test_v5f16_param_0+8];
|
||||
; CHECK: .param .align 16 .b8 param0[16];
|
||||
; CHECK-DAG: st.param.v4.b16 [param0+0],
|
||||
; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
|
||||
; CHECK: .param .align 16 .b8 retval0[16];
|
||||
; CHECK: call.uni (retval0),
|
||||
; CHECK: test_v5f16,
|
||||
; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
|
||||
; CHECK-DAG: ld.param.b16 [[R4:%h[0-9]+]], [retval0+8];
|
||||
; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
|
||||
; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]];
|
||||
; CHECK: ret;
|
||||
define <5 x half> @test_v5f16(<5 x half> %a) {
|
||||
%r = tail call <5 x half> @test_v5f16(<5 x half> %a);
|
||||
ret <5 x half> %r;
|
||||
}
|
||||
|
||||
; CHECK:.func (.param .align 16 .b8 func_retval0[16])
|
||||
; CHECK-LABEL: test_v8f16(
|
||||
; CHECK: .param .align 16 .b8 test_v8f16_param_0[16]
|
||||
; CHECK: ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0];
|
||||
; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]];
|
||||
; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]];
|
||||
; CHECK-DAG: mov.b32 [[HH45:%hh[0-9]+]], [[R45]];
|
||||
; CHECK-DAG: mov.b32 [[HH67:%hh[0-9]+]], [[R67]];
|
||||
; CHECK: .param .align 16 .b8 param0[16];
|
||||
; CHECK: st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]};
|
||||
; CHECK: .param .align 16 .b8 retval0[16];
|
||||
; CHECK: call.uni (retval0),
|
||||
; CHECK: test_v8f16,
|
||||
; CHECK: ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0];
|
||||
; CHECK: st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]};
|
||||
; CHECK: ret;
|
||||
define <8 x half> @test_v8f16(<8 x half> %a) {
|
||||
%r = tail call <8 x half> @test_v8f16(<8 x half> %a);
|
||||
ret <8 x half> %r;
|
||||
}
|
||||
|
||||
; CHECK:.func (.param .align 32 .b8 func_retval0[32])
|
||||
; CHECK-LABEL: test_v9f16(
|
||||
; CHECK: .param .align 32 .b8 test_v9f16_param_0[32]
|
||||
; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0];
|
||||
; CHECK-DAG: ld.param.v4.b16 {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8];
|
||||
; CHECK-DAG: ld.param.b16 [[E8:%h[0-9]+]], [test_v9f16_param_0+16];
|
||||
; CHECK: .param .align 32 .b8 param0[32];
|
||||
; CHECK-DAG: st.param.v4.b16 [param0+0],
|
||||
; CHECK-DAG: st.param.v4.b16 [param0+8],
|
||||
; CHECK-DAG: st.param.b16 [param0+16], [[E8]];
|
||||
; CHECK: .param .align 32 .b8 retval0[32];
|
||||
; CHECK: call.uni (retval0),
|
||||
; CHECK: test_v9f16,
|
||||
; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
|
||||
; CHECK-DAG: ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8];
|
||||
; CHECK-DAG: ld.param.b16 [[R8:%h[0-9]+]], [retval0+16];
|
||||
; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
|
||||
; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]};
|
||||
; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]];
|
||||
; CHECK: ret;
|
||||
define <9 x half> @test_v9f16(<9 x half> %a) {
|
||||
%r = tail call <9 x half> @test_v9f16(<9 x half> %a);
|
||||
ret <9 x half> %r;
|
||||
}
|
||||
|
||||
; CHECK: .func (.param .b32 func_retval0)
|
||||
; CHECK-LABEL: test_i32(
|
||||
; CHECK-NEXT: .param .b32 test_i32_param_0
|
||||
@ -415,19 +541,19 @@ define <5 x i32> @test_v5i32(<5 x i32> %a) {
|
||||
}
|
||||
|
||||
; CHECK: .func (.param .b32 func_retval0)
|
||||
; CHECK-LABEL: test_float(
|
||||
; CHECK-NEXT: .param .b32 test_float_param_0
|
||||
; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_float_param_0];
|
||||
; CHECK-LABEL: test_f32(
|
||||
; CHECK-NEXT: .param .b32 test_f32_param_0
|
||||
; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_f32_param_0];
|
||||
; CHECK: .param .b32 param0;
|
||||
; CHECK: st.param.f32 [param0+0], [[E]];
|
||||
; CHECK: .param .b32 retval0;
|
||||
; CHECK: call.uni (retval0),
|
||||
; CHECK-NEXT: test_float,
|
||||
; CHECK-NEXT: test_f32,
|
||||
; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
|
||||
; CHECK: st.param.f32 [func_retval0+0], [[R]];
|
||||
; CHECK-NEXT: ret;
|
||||
define float @test_float(float %a) {
|
||||
%r = tail call float @test_float(float %a);
|
||||
define float @test_f32(float %a) {
|
||||
%r = tail call float @test_f32(float %a);
|
||||
ret float %r;
|
||||
}
|
||||
|
||||
@ -547,20 +673,20 @@ define %s_i16 @test_s_i16(%s_i16 %a) {
|
||||
}
|
||||
|
||||
; CHECK: .func (.param .align 2 .b8 func_retval0[2])
|
||||
; CHECK-LABEL: test_s_half(
|
||||
; CHECK-NEXT: .param .align 2 .b8 test_s_half_param_0[2]
|
||||
; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_half_param_0];
|
||||
; CHECK-LABEL: test_s_f16(
|
||||
; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2]
|
||||
; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0];
|
||||
; CHECK: .param .align 2 .b8 param0[2];
|
||||
; CHECK: st.param.b16 [param0+0], [[A]]
|
||||
; CHECK: .param .align 2 .b8 retval0[2];
|
||||
; CHECK: call.uni
|
||||
; CHECK-NEXT: test_s_half,
|
||||
; CHECK-NEXT: test_s_f16,
|
||||
; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
|
||||
; CHECK: st.param.b16 [func_retval0+0], [[R]];
|
||||
; CHECK-NEXT: ret;
|
||||
define %s_half @test_s_half(%s_half %a) {
|
||||
%r = tail call %s_half @test_s_half(%s_half %a);
|
||||
ret %s_half %r;
|
||||
define %s_f16 @test_s_f16(%s_f16 %a) {
|
||||
%r = tail call %s_f16 @test_s_f16(%s_f16 %a);
|
||||
ret %s_f16 %r;
|
||||
}
|
||||
|
||||
; CHECK: .func (.param .align 4 .b8 func_retval0[4])
|
||||
@ -581,20 +707,20 @@ define %s_i32 @test_s_i32(%s_i32 %a) {
|
||||
}
|
||||
|
||||
; CHECK: .func (.param .align 4 .b8 func_retval0[4])
|
||||
; CHECK-LABEL: test_s_float(
|
||||
; CHECK-NEXT: .param .align 4 .b8 test_s_float_param_0[4]
|
||||
; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_float_param_0];
|
||||
; CHECK-LABEL: test_s_f32(
|
||||
; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4]
|
||||
; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_f32_param_0];
|
||||
; CHECK: .param .align 4 .b8 param0[4]
|
||||
; CHECK: st.param.f32 [param0+0], [[E]];
|
||||
; CHECK: .param .align 4 .b8 retval0[4];
|
||||
; CHECK: call.uni (retval0),
|
||||
; CHECK-NEXT: test_s_float,
|
||||
; CHECK-NEXT: test_s_f32,
|
||||
; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
|
||||
; CHECK: st.param.f32 [func_retval0+0], [[R]];
|
||||
; CHECK-NEXT: ret;
|
||||
define %s_float @test_s_float(%s_float %a) {
|
||||
%r = tail call %s_float @test_s_float(%s_float %a);
|
||||
ret %s_float %r;
|
||||
define %s_f32 @test_s_f32(%s_f32 %a) {
|
||||
%r = tail call %s_f32 @test_s_f32(%s_f32 %a);
|
||||
ret %s_f32 %r;
|
||||
}
|
||||
|
||||
; CHECK: .func (.param .align 8 .b8 func_retval0[8])
|
||||
|
Loading…
Reference in New Issue
Block a user