mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-25 12:12:47 +01:00
AMDGPU: Move zeroed FP high bits optimization to patterns
This commit is contained in:
parent
cbac628d6a
commit
50b757aa13
@ -44,6 +44,63 @@ class R600InstrInfo;
|
|||||||
|
|
||||||
namespace {
|
namespace {
|
||||||
|
|
||||||
|
// Instructions that will be lowered with a final instruction that zeros the
|
||||||
|
// high result bits.
|
||||||
|
// XXX - probably only need to list legal operations.
|
||||||
|
static bool fp16SrcZerosHighBits(unsigned Opc) {
|
||||||
|
switch (Opc) {
|
||||||
|
case ISD::FADD:
|
||||||
|
case ISD::FSUB:
|
||||||
|
case ISD::FMUL:
|
||||||
|
case ISD::FDIV:
|
||||||
|
case ISD::FREM:
|
||||||
|
case ISD::FMA:
|
||||||
|
case ISD::FMAD:
|
||||||
|
case ISD::FCANONICALIZE:
|
||||||
|
case ISD::FP_ROUND:
|
||||||
|
case ISD::UINT_TO_FP:
|
||||||
|
case ISD::SINT_TO_FP:
|
||||||
|
case ISD::FABS:
|
||||||
|
// Fabs is lowered to a bit operation, but it's an and which will clear the
|
||||||
|
// high bits anyway.
|
||||||
|
case ISD::FSQRT:
|
||||||
|
case ISD::FSIN:
|
||||||
|
case ISD::FCOS:
|
||||||
|
case ISD::FPOWI:
|
||||||
|
case ISD::FPOW:
|
||||||
|
case ISD::FLOG:
|
||||||
|
case ISD::FLOG2:
|
||||||
|
case ISD::FLOG10:
|
||||||
|
case ISD::FEXP:
|
||||||
|
case ISD::FEXP2:
|
||||||
|
case ISD::FCEIL:
|
||||||
|
case ISD::FTRUNC:
|
||||||
|
case ISD::FRINT:
|
||||||
|
case ISD::FNEARBYINT:
|
||||||
|
case ISD::FROUND:
|
||||||
|
case ISD::FFLOOR:
|
||||||
|
case ISD::FMINNUM:
|
||||||
|
case ISD::FMAXNUM:
|
||||||
|
case AMDGPUISD::FRACT:
|
||||||
|
case AMDGPUISD::CLAMP:
|
||||||
|
case AMDGPUISD::COS_HW:
|
||||||
|
case AMDGPUISD::SIN_HW:
|
||||||
|
case AMDGPUISD::FMIN3:
|
||||||
|
case AMDGPUISD::FMAX3:
|
||||||
|
case AMDGPUISD::FMED3:
|
||||||
|
case AMDGPUISD::FMAD_FTZ:
|
||||||
|
case AMDGPUISD::RCP:
|
||||||
|
case AMDGPUISD::RSQ:
|
||||||
|
case AMDGPUISD::RCP_IFLAG:
|
||||||
|
case AMDGPUISD::LDEXP:
|
||||||
|
return true;
|
||||||
|
default:
|
||||||
|
// fcopysign, select and others may be lowered to 32-bit bit operations
|
||||||
|
// which don't zero the high bits.
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
static bool isNullConstantOrUndef(SDValue V) {
|
static bool isNullConstantOrUndef(SDValue V) {
|
||||||
if (V.isUndef())
|
if (V.isUndef())
|
||||||
return true;
|
return true;
|
||||||
|
@ -4353,7 +4353,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||||||
NODE_NAME_CASE(CVT_PK_I16_I32)
|
NODE_NAME_CASE(CVT_PK_I16_I32)
|
||||||
NODE_NAME_CASE(CVT_PK_U16_U32)
|
NODE_NAME_CASE(CVT_PK_U16_U32)
|
||||||
NODE_NAME_CASE(FP_TO_FP16)
|
NODE_NAME_CASE(FP_TO_FP16)
|
||||||
NODE_NAME_CASE(FP16_ZEXT)
|
|
||||||
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
|
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
|
||||||
NODE_NAME_CASE(CONST_DATA_PTR)
|
NODE_NAME_CASE(CONST_DATA_PTR)
|
||||||
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
|
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
|
||||||
@ -4483,8 +4482,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
|
|||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case AMDGPUISD::FP_TO_FP16:
|
case AMDGPUISD::FP_TO_FP16: {
|
||||||
case AMDGPUISD::FP16_ZEXT: {
|
|
||||||
unsigned BitWidth = Known.getBitWidth();
|
unsigned BitWidth = Known.getBitWidth();
|
||||||
|
|
||||||
// High bits are zero.
|
// High bits are zero.
|
||||||
@ -4631,7 +4629,6 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
|
|||||||
case AMDGPUISD::BUFFER_LOAD_USHORT:
|
case AMDGPUISD::BUFFER_LOAD_USHORT:
|
||||||
return 16;
|
return 16;
|
||||||
case AMDGPUISD::FP_TO_FP16:
|
case AMDGPUISD::FP_TO_FP16:
|
||||||
case AMDGPUISD::FP16_ZEXT:
|
|
||||||
return 16;
|
return 16;
|
||||||
default:
|
default:
|
||||||
return 1;
|
return 1;
|
||||||
|
@ -457,9 +457,6 @@ enum NodeType : unsigned {
|
|||||||
// are known 0.
|
// are known 0.
|
||||||
FP_TO_FP16,
|
FP_TO_FP16,
|
||||||
|
|
||||||
// Wrapper around fp16 results that are known to zero the high bits.
|
|
||||||
FP16_ZEXT,
|
|
||||||
|
|
||||||
/// This node is for VLIW targets and it is used to represent a vector
|
/// This node is for VLIW targets and it is used to represent a vector
|
||||||
/// that is stored in consecutive registers with the same channel.
|
/// that is stored in consecutive registers with the same channel.
|
||||||
/// For example:
|
/// For example:
|
||||||
|
@ -132,7 +132,6 @@ def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFP
|
|||||||
def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
|
def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
|
||||||
def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
|
def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
|
||||||
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
|
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
|
||||||
def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
|
|
||||||
|
|
||||||
|
|
||||||
def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
|
def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
|
||||||
|
@ -9375,63 +9375,6 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
|
|||||||
return SDValue();
|
return SDValue();
|
||||||
}
|
}
|
||||||
|
|
||||||
// Instructions that will be lowered with a final instruction that zeros the
|
|
||||||
// high result bits.
|
|
||||||
// XXX - probably only need to list legal operations.
|
|
||||||
static bool fp16SrcZerosHighBits(unsigned Opc) {
|
|
||||||
switch (Opc) {
|
|
||||||
case ISD::FADD:
|
|
||||||
case ISD::FSUB:
|
|
||||||
case ISD::FMUL:
|
|
||||||
case ISD::FDIV:
|
|
||||||
case ISD::FREM:
|
|
||||||
case ISD::FMA:
|
|
||||||
case ISD::FMAD:
|
|
||||||
case ISD::FCANONICALIZE:
|
|
||||||
case ISD::FP_ROUND:
|
|
||||||
case ISD::UINT_TO_FP:
|
|
||||||
case ISD::SINT_TO_FP:
|
|
||||||
case ISD::FABS:
|
|
||||||
// Fabs is lowered to a bit operation, but it's an and which will clear the
|
|
||||||
// high bits anyway.
|
|
||||||
case ISD::FSQRT:
|
|
||||||
case ISD::FSIN:
|
|
||||||
case ISD::FCOS:
|
|
||||||
case ISD::FPOWI:
|
|
||||||
case ISD::FPOW:
|
|
||||||
case ISD::FLOG:
|
|
||||||
case ISD::FLOG2:
|
|
||||||
case ISD::FLOG10:
|
|
||||||
case ISD::FEXP:
|
|
||||||
case ISD::FEXP2:
|
|
||||||
case ISD::FCEIL:
|
|
||||||
case ISD::FTRUNC:
|
|
||||||
case ISD::FRINT:
|
|
||||||
case ISD::FNEARBYINT:
|
|
||||||
case ISD::FROUND:
|
|
||||||
case ISD::FFLOOR:
|
|
||||||
case ISD::FMINNUM:
|
|
||||||
case ISD::FMAXNUM:
|
|
||||||
case AMDGPUISD::FRACT:
|
|
||||||
case AMDGPUISD::CLAMP:
|
|
||||||
case AMDGPUISD::COS_HW:
|
|
||||||
case AMDGPUISD::SIN_HW:
|
|
||||||
case AMDGPUISD::FMIN3:
|
|
||||||
case AMDGPUISD::FMAX3:
|
|
||||||
case AMDGPUISD::FMED3:
|
|
||||||
case AMDGPUISD::FMAD_FTZ:
|
|
||||||
case AMDGPUISD::RCP:
|
|
||||||
case AMDGPUISD::RSQ:
|
|
||||||
case AMDGPUISD::RCP_IFLAG:
|
|
||||||
case AMDGPUISD::LDEXP:
|
|
||||||
return true;
|
|
||||||
default:
|
|
||||||
// fcopysign, select and others may be lowered to 32-bit bit operations
|
|
||||||
// which don't zero the high bits.
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
|
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
|
||||||
DAGCombinerInfo &DCI) const {
|
DAGCombinerInfo &DCI) const {
|
||||||
if (!Subtarget->has16BitInsts() ||
|
if (!Subtarget->has16BitInsts() ||
|
||||||
@ -9446,15 +9389,6 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
|
|||||||
if (Src.getValueType() != MVT::i16)
|
if (Src.getValueType() != MVT::i16)
|
||||||
return SDValue();
|
return SDValue();
|
||||||
|
|
||||||
// (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
|
|
||||||
// FIXME: It is not universally true that the high bits are zeroed on gfx9.
|
|
||||||
if (Src.getOpcode() == ISD::BITCAST) {
|
|
||||||
SDValue BCSrc = Src.getOperand(0);
|
|
||||||
if (BCSrc.getValueType() == MVT::f16 &&
|
|
||||||
fp16SrcZerosHighBits(BCSrc.getOpcode()))
|
|
||||||
return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
|
|
||||||
}
|
|
||||||
|
|
||||||
return SDValue();
|
return SDValue();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -814,6 +814,12 @@ def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
|
|||||||
(isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode()));
|
(isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode()));
|
||||||
}], getNegV2I16Imm>;
|
}], getNegV2I16Imm>;
|
||||||
|
|
||||||
|
|
||||||
|
def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
|
||||||
|
return fp16SrcZerosHighBits(N->getOpcode());
|
||||||
|
}]>;
|
||||||
|
|
||||||
|
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// MUBUF/SMEM Patterns
|
// MUBUF/SMEM Patterns
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
|
@ -1992,11 +1992,13 @@ def : GCNPat <
|
|||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
// Miscellaneous Patterns
|
// Miscellaneous Patterns
|
||||||
//===----------------------------------------------------------------------===//
|
//===----------------------------------------------------------------------===//
|
||||||
def : GCNPat <
|
|
||||||
(i32 (AMDGPUfp16_zext f16:$src)),
|
|
||||||
(COPY $src)
|
|
||||||
>;
|
|
||||||
|
|
||||||
|
// Eliminate a zero extension from an fp16 operation if it already
|
||||||
|
// zeros the high bits of the 32-bit register.
|
||||||
|
def : GCNPat<
|
||||||
|
(i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
|
||||||
|
(COPY VSrc_b16:$src)
|
||||||
|
>;
|
||||||
|
|
||||||
def : GCNPat <
|
def : GCNPat <
|
||||||
(i32 (trunc i64:$a)),
|
(i32 (trunc i64:$a)),
|
||||||
|
Loading…
Reference in New Issue
Block a user