AMDGPU: Move zeroed FP high bits optimization to patterns

2024-11-25 12:12:47 +01:00 · 2021-06-15 17:12:02 -04:00 · 2021-06-15 17:12:02 -04:00 · 50b757aa13
commit 50b757aa13
parent cbac628d6a
7 changed files with 70 additions and 78 deletions
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@ -44,6 +44,63 @@ class R600InstrInfo;
 namespace {
 /// Reports whether \p Opc is an operation that will be lowered with a final
 /// instruction that zeros the high result bits.
 ///
 /// Every opcode that is not explicitly listed is rejected.
 // TODO: Only legal operations need to be listed here.
 static bool fp16SrcZerosHighBits(unsigned Opc) {
  switch (Opc) {
  // Arithmetic.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FMA:
  case ISD::FMAD:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::LDEXP:
  // Conversions and canonicalization.
  case ISD::FCANONICALIZE:
  case ISD::FP_ROUND:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  // Fabs is lowered to a bit operation, but that operation is an 'and', which
  // clears the high bits anyway.
  case ISD::FABS:
  // Roots, reciprocals, trigonometric, power, log and exp.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
  // Rounding.
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case AMDGPUISD::FRACT:
  // Min / max / clamp.
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::CLAMP:
    return true;
  default:
    break;
  }

  // fcopysign, select and others may be lowered to 32-bit bit operations
  // which don't zero the high bits.
  return false;
 }
 static bool isNullConstantOrUndef(SDValue V) {
  if (V.isUndef())
    return true;
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@ -4353,7 +4353,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
  NODE_NAME_CASE(CVT_PK_I16_I32)
  NODE_NAME_CASE(CVT_PK_U16_U32)
  NODE_NAME_CASE(FP_TO_FP16)
  NODE_NAME_CASE(FP16_ZEXT)
  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
  NODE_NAME_CASE(CONST_DATA_PTR)
  NODE_NAME_CASE(PC_ADD_REL_OFFSET)
@ -4483,8 +4482,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
    break;
  }
-  case AMDGPUISD::FP_TO_FP16:
+  case AMDGPUISD::FP_TO_FP16: {
  case AMDGPUISD::FP16_ZEXT: {
    unsigned BitWidth = Known.getBitWidth();
    // High bits are zero.
@ -4631,7 +4629,6 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
  case AMDGPUISD::BUFFER_LOAD_USHORT:
    return 16;
  case AMDGPUISD::FP_TO_FP16:
  case AMDGPUISD::FP16_ZEXT:
    return 16;
  default:
    return 1;
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@ -457,9 +457,6 @@ enum NodeType : unsigned {
  // are known 0.
  FP_TO_FP16,
  // Wrapper around fp16 results that are known to zero the high bits.
  FP16_ZEXT,
  /// This node is for VLIW targets and it is used to represent a vector
  /// that is stored in consecutive registers with the same channel.
  /// For example:
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@ -132,7 +132,6 @@ def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFP
 def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
 def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
 def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
 def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
 def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@ -9375,63 +9375,6 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
  return SDValue();
 }
 // Instructions that will be lowered with a final instruction that zeros the
 // high result bits.
 //
 // Returns true when \p Opc is one of the opcodes listed below and false for
 // every other opcode. The caller visible in this file,
 // performZeroExtendCombine, passes the opcode of an f16 value that feeds an
 // (i32 zext (i16 bitcast ...)) and, on a true result, wraps that value in an
 // AMDGPUISD::FP16_ZEXT node.
 // XXX - probably only need to list legal operations.
 static bool fp16SrcZerosHighBits(unsigned Opc) {
  switch (Opc) {
  // Generic ISD floating-point operations.
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
  case ISD::FDIV:
  case ISD::FREM:
  case ISD::FMA:
  case ISD::FMAD:
  case ISD::FCANONICALIZE:
  case ISD::FP_ROUND:
  case ISD::UINT_TO_FP:
  case ISD::SINT_TO_FP:
  case ISD::FABS:
    // Fabs is lowered to a bit operation, but it's an and which will clear the
    // high bits anyway.
  case ISD::FSQRT:
  case ISD::FSIN:
  case ISD::FCOS:
  case ISD::FPOWI:
  case ISD::FPOW:
  case ISD::FLOG:
  case ISD::FLOG2:
  case ISD::FLOG10:
  case ISD::FEXP:
  case ISD::FEXP2:
  case ISD::FCEIL:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT:
  case ISD::FROUND:
  case ISD::FFLOOR:
  case ISD::FMINNUM:
  case ISD::FMAXNUM:
  // AMDGPU target-specific nodes.
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::COS_HW:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LDEXP:
    return true;
  default:
    // fcopysign, select and others may be lowered to 32-bit bit operations
    // which don't zero the high bits.
    return false;
  }
 }
 SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
  if (!Subtarget->has16BitInsts() ||
@ -9446,15 +9389,6 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
  if (Src.getValueType() != MVT::i16)
    return SDValue();
  // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
  if (Src.getOpcode() == ISD::BITCAST) {
    SDValue BCSrc = Src.getOperand(0);
    if (BCSrc.getValueType() == MVT::f16 &&
        fp16SrcZerosHighBits(BCSrc.getOpcode()))
      return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
  }
  return SDValue();
 }
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@ -814,6 +814,12 @@ def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
         (isNullConstantOrUndef(Src1) && isNegInlineImmediate(Src0.getNode()));
 }], getNegV2I16Imm>;
 // Leaf that matches an f16 VGPR_32 value only when the opcode of the matched
 // node is accepted by the C++ helper fp16SrcZerosHighBits.
 def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
  return fp16SrcZerosHighBits(N->getOpcode());
 }]>;
 //===----------------------------------------------------------------------===//
 // MUBUF/SMEM Patterns
 //===----------------------------------------------------------------------===//
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@ -1992,11 +1992,13 @@ def : GCNPat <
 //===----------------------------------------------------------------------===//
 // Miscellaneous Patterns
 //===----------------------------------------------------------------------===//
 // Select the AMDGPUfp16_zext wrapper around an f16 value as a plain COPY of
 // that value into the i32 result.
 def : GCNPat <
  (i32 (AMDGPUfp16_zext f16:$src)),
  (COPY $src)
 >;
 // Eliminate a zero extension from an fp16 operation if it already
 // zeros the high bits of the 32-bit register: an
 // (i32 (zext (i16 (bitconvert x)))) is selected as a plain COPY of x
 // whenever x satisfies the fp16_zeros_high_16bits leaf predicate.
 def : GCNPat<
  (i32 (zext (i16 (bitconvert fp16_zeros_high_16bits:$src)))),
  (COPY VSrc_b16:$src)
 >;
 def : GCNPat <
  (i32 (trunc i64:$a)),