1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-25 12:12:47 +01:00

AMDGPU: Move zeroed FP high bits optimization to patterns

This commit is contained in:
Matt Arsenault 2021-06-15 17:12:02 -04:00
parent cbac628d6a
commit 50b757aa13
7 changed files with 70 additions and 78 deletions

View File

@ -44,6 +44,63 @@ class R600InstrInfo;
namespace { namespace {
// Reports whether a node with opcode Opc is one that will be lowered with a
// final instruction that zeros the high result bits.
//
// Consulted by the fp16_zeros_high_16bits PatLeaf so that a zero extension of
// such an f16 result can be selected as a plain copy.
//
// XXX - only need to list legal operations.
// NOTE(review): the zext combine this predicate used to serve carried a FIXME
// saying the high bits are not universally zeroed on gfx9 - confirm.
static bool fp16SrcZerosHighBits(unsigned Opc) {
  switch (Opc) {
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;

  // Arithmetic and canonicalization.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FCANONICALIZE:

  // Conversions.
  case ISD::FP_ROUND:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:

  // Fabs is lowered to a bit operation, but it's an and which will clear the
  // high bits anyway.
  case ISD::FABS:

  // Math functions.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:

  // Rounding.
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:

  // Min / max.
  case ISD::FMINNUM:
  case ISD::FMAXNUM:

  // Target-specific nodes.
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LDEXP:
    return true;
  }
}
static bool isNullConstantOrUndef(SDValue V) { static bool isNullConstantOrUndef(SDValue V) {
if (V.isUndef()) if (V.isUndef())
return true; return true;

View File

@ -4353,7 +4353,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CVT_PK_I16_I32) NODE_NAME_CASE(CVT_PK_I16_I32)
NODE_NAME_CASE(CVT_PK_U16_U32) NODE_NAME_CASE(CVT_PK_U16_U32)
NODE_NAME_CASE(FP_TO_FP16) NODE_NAME_CASE(FP_TO_FP16)
NODE_NAME_CASE(FP16_ZEXT)
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR) NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
NODE_NAME_CASE(CONST_DATA_PTR) NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET) NODE_NAME_CASE(PC_ADD_REL_OFFSET)
@ -4483,8 +4482,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
break; break;
} }
case AMDGPUISD::FP_TO_FP16: case AMDGPUISD::FP_TO_FP16: {
case AMDGPUISD::FP16_ZEXT: {
unsigned BitWidth = Known.getBitWidth(); unsigned BitWidth = Known.getBitWidth();
// High bits are zero. // High bits are zero.
@ -4631,7 +4629,6 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
case AMDGPUISD::BUFFER_LOAD_USHORT: case AMDGPUISD::BUFFER_LOAD_USHORT:
return 16; return 16;
case AMDGPUISD::FP_TO_FP16: case AMDGPUISD::FP_TO_FP16:
case AMDGPUISD::FP16_ZEXT:
return 16; return 16;
default: default:
return 1; return 1;

View File

@ -457,9 +457,6 @@ enum NodeType : unsigned {
// are known 0. // are known 0.
FP_TO_FP16, FP_TO_FP16,
// Wrapper around fp16 results that are known to zero the high bits.
FP16_ZEXT,
/// This node is for VLIW targets and it is used to represent a vector /// This node is for VLIW targets and it is used to represent a vector
/// that is stored in consecutive registers with the same channel. /// that is stored in consecutive registers with the same channel.
/// For example: /// For example:

View File

@ -132,7 +132,6 @@ def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFP
def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>; def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>; def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>; def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>; def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;

View File

@ -9375,63 +9375,6 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
return SDValue(); return SDValue();
} }
// Tells whether Opc names an operation that will be lowered with a final
// instruction that zeros the high result bits.
// XXX - probably only need to list legal operations.
static bool fp16SrcZerosHighBits(unsigned Opc) {
  // Anything not listed below is assumed to leave the high bits alone:
  // fcopysign, select and others may be lowered to 32-bit bit operations
  // which don't zero the high bits.
  bool ZerosHighBits = false;

  switch (Opc) {
  // Generic floating-point nodes.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FCANONICALIZE:
  case ISD::FP_ROUND:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  // Fabs is lowered to a bit operation, but it's an and which will clear the
  // high bits anyway.
  case ISD::FABS:
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  // AMDGPU-specific nodes.
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LDEXP:
    ZerosHighBits = true;
    break;
  default:
    break;
  }

  return ZerosHighBits;
}
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
DAGCombinerInfo &DCI) const { DAGCombinerInfo &DCI) const {
if (!Subtarget->has16BitInsts() || if (!Subtarget->has16BitInsts() ||
@ -9446,15 +9389,6 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
if (Src.getValueType() != MVT::i16) if (Src.getValueType() != MVT::i16)
return SDValue(); return SDValue();
// (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
// FIXME: It is not universally true that the high bits are zeroed on gfx9.
if (Src.getOpcode() == ISD::BITCAST) {
SDValue BCSrc = Src.getOperand(0);
if (BCSrc.getValueType() == MVT::f16 &&
fp16SrcZerosHighBits(BCSrc.getOpcode()))
return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
}
return SDValue(); return SDValue();
} }

View File

@ -814,6 +814,12 @@ def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
(isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode())); (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode()));
}], getNegV2I16Imm>; }], getNegV2I16Imm>;
// Pattern leaf for an f16 value in VGPR_32 whose node opcode is accepted by
// fp16SrcZerosHighBits, i.e. an operation listed there as being lowered with
// a final instruction that zeros the high result bits.
def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
// The predicate looks only at the opcode of the matched node N.
return fp16SrcZerosHighBits(N->getOpcode());
}]>;
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
// MUBUF/SMEM Patterns // MUBUF/SMEM Patterns
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//

View File

@ -1992,11 +1992,13 @@ def : GCNPat <
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
// Miscellaneous Patterns // Miscellaneous Patterns
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
// Select the AMDGPUfp16_zext wrapper node (i32 result, f16 source) to a plain
// COPY of its source; no extra instruction is emitted for the extension.
def : GCNPat <
(i32 (AMDGPUfp16_zext f16:$src)),
(COPY $src)
>;
// Eliminate a zero extension from an fp16 operation if it already
// zeros the high bits of the 32-bit register.
//
// Matches (i32 zext (i16 bitconvert x)) where x satisfies the
// fp16_zeros_high_16bits leaf, and selects the whole expression to a COPY of
// the source operand.
def : GCNPat<
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
(COPY VSrc_b16:$src)
>;
def : GCNPat < def : GCNPat <
(i32 (trunc i64:$a)), (i32 (trunc i64:$a)),