1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 12:12:47 +01:00

AMDGPU: Move zeroed FP high bits optimization to patterns

This commit is contained in:
Matt Arsenault 2021-06-15 17:12:02 -04:00
parent cbac628d6a
commit 50b757aa13
7 changed files with 70 additions and 78 deletions

View File

@ -44,6 +44,63 @@ class R600InstrInfo;
namespace {
// Predicate over DAG opcodes: answers true for operations that will be lowered
// with a final instruction that zeros the high bits of the result, and false
// for everything else.
// XXX - only need to list legal operations.
static bool fp16SrcZerosHighBits(unsigned Opc) {
  switch (Opc) {
  // Basic arithmetic and fused/multiply-add forms.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::FMAD_FTZ:
    return true;

  // Canonicalization and conversions.
  case ISD::FCANONICALIZE:
  case ISD::FP_ROUND:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
    return true;

  // Fabs is lowered to a bit operation, but it's an and which will clear the
  // high bits anyway.
  case ISD::FABS:
    return true;

  // Roots, trigonometry, powers, logarithms and exponentials.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
    return true;

  // Rounding family.
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
    return true;

  // Two-operand min/max.
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return true;

  // Target-specific nodes.
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LDEXP:
    return true;

  default:
    break;
  }

  // fcopysign, select and others may be lowered to 32-bit bit operations
  // which don't zero the high bits.
  return false;
}
static bool isNullConstantOrUndef(SDValue V) {
if (V.isUndef())
return true;

View File

@ -4353,7 +4353,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_PK_I16_I32)
NODE_NAME_CASE(CVT_PK_U16_U32)
NODE_NAME_CASE(FP_TO_FP16)
NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
@ -4483,8 +4482,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
break;
}
case AMDGPUISD::FP_TO_FP16:
case AMDGPUISD::FP16_ZEXT: {
case AMDGPUISD::FP_TO_FP16: {
unsigned BitWidth = Known.getBitWidth();
// High bits are zero.
@ -4631,7 +4629,6 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
case AMDGPUISD::BUFFER_LOAD_USHORT:
return 16;
case AMDGPUISD::FP_TO_FP16:
case AMDGPUISD::FP16_ZEXT:
return 16;
default:
return 1;

View File

@ -457,9 +457,6 @@ enum NodeType : unsigned {
// are known 0.
FP_TO_FP16,
// Wrapper around fp16 results that are known to zero the high bits.
FP16_ZEXT,
/// This node is for VLIW targets and it is used to represent a vector
/// that is stored in consecutive registers with the same channel.
/// For example:

View File

@ -132,7 +132,6 @@ def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFP
def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;

View File

@ -9375,63 +9375,6 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
return SDValue();
}
// Predicate over DAG opcodes: answers true for operations that will be lowered
// with a final instruction that zeros the high bits of the result, and false
// for everything else.
// XXX - probably only need to list legal operations.
static bool fp16SrcZerosHighBits(unsigned Opc) {
  switch (Opc) {
  // Basic arithmetic and fused/multiply-add forms.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::FMAD_FTZ:
    return true;

  // Canonicalization and conversions.
  case ISD::FCANONICALIZE:
  case ISD::FP_ROUND:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
    return true;

  // Fabs is lowered to a bit operation, but it's an and which will clear the
  // high bits anyway.
  case ISD::FABS:
    return true;

  // Roots, trigonometry, powers, logarithms and exponentials.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
    return true;

  // Rounding family.
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
    return true;

  // Two-operand min/max.
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
    return true;

  // Target-specific nodes.
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LDEXP:
    return true;

  default:
    break;
  }

  // fcopysign, select and others may be lowered to 32-bit bit operations
  // which don't zero the high bits.
  return false;
}
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (!Subtarget->has16BitInsts() ||
@ -9446,15 +9389,6 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
if (Src.getValueType() != MVT::i16)
return SDValue();
// (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
// FIXME: It is not universally true that the high bits are zeroed on gfx9.
if (Src.getOpcode() == ISD::BITCAST) {
SDValue BCSrc = Src.getOperand(0);
if (BCSrc.getValueType() == MVT::f16 &&
fp16SrcZerosHighBits(BCSrc.getOpcode()))
return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
}
return SDValue();
}

View File

@ -814,6 +814,12 @@ def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
(isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode()));
}], getNegV2I16Imm>;
// Pattern leaf for an f16 value in a VGPR_32. The embedded C++ predicate
// accepts the node only when fp16SrcZerosHighBits() returns true for the
// node's own opcode.
def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
return fp16SrcZerosHighBits(N->getOpcode());
}]>;
//===----------------------------------------------------------------------===//
// MUBUF/SMEM Patterns
//===----------------------------------------------------------------------===//

View File

@ -1992,11 +1992,13 @@ def : GCNPat <
//===----------------------------------------------------------------------===//
// Miscellaneous Patterns
//===----------------------------------------------------------------------===//
// Select an i32 AMDGPUfp16_zext of an f16 source as a plain COPY of that
// source; no separate extension instruction is emitted by this pattern.
def : GCNPat <
(i32 (AMDGPUfp16_zext f16:$src)),
(COPY $src)
>;
// Eliminate a zero extension from an fp16 operation if it already
// zeros the high bits of the 32-bit register.
//
// Matches (i32 zext (i16 bitconvert x)) where x satisfies the
// fp16_zeros_high_16bits pattern leaf, and selects it as a COPY of x instead
// of an explicit extension.
def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(COPY VSrc_b16:$src)
>;
def : GCNPat <
(i32 (trunc i64:$a)),