AMDGPU: Address todo for handling 1/(2 pi)

llvm-svn: 339814
2025-01-31 20:51:52 +01:00 · 2018-08-15 21:03:55 +00:00 · 2018-08-15 21:03:55 +00:00 · d8fe316d41
commit d8fe316d41
parent cd6d7bb841
6 changed files with 384 additions and 164 deletions
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@ -3449,9 +3449,27 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
 }

-static bool isConstantFPZero(SDValue N) {
-  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
-    return C->isZero() && !C->isNegative();
+static bool isInv2Pi(const APFloat &APF) {
+  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
+  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
+  static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
+
+  return APF.bitwiseIsEqual(KF16) ||
+         APF.bitwiseIsEqual(KF32) ||
+         APF.bitwiseIsEqual(KF64);
+}
+
+// 0 and 1.0 / (2.0 * pi) have inline immediates, but their negations do not,
+// so there is an additional cost to negate them.
+bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
+  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
+    if (C->isZero() && !C->isNegative())
+      return true;
+
+    if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
+      return true;
+  }
+
  return false;
 }

@ -3577,9 +3595,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
    SDValue RHS = N0.getOperand(1);

    // 0 doesn't have a negated inline immediate.
-    // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
-    // operations.
-    if (isConstantFPZero(RHS))
+    // TODO: This constant check should be generalized to other operations.
+    if (isConstantCostlierToNegate(RHS))
      return SDValue();

    SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@ -95,6 +95,8 @@ protected:
  SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
                             SDValue RHS, DAGCombinerInfo &DCI) const;
  SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
+  bool isConstantCostlierToNegate(SDValue N) const;
  SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
  SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@ -136,6 +136,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT,
  HasVOP3PInsts(false),
  HasMulI24(true),
  HasMulU24(true),
+  HasInv2PiInlineImm(false),
  HasFminFmaxLegacy(true),
  EnablePromoteAlloca(false),
  LocalMemorySize(0),
@ -190,7 +191,6 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
    HasVGPRIndexMode(false),
    HasScalarStores(false),
    HasScalarAtomics(false),
-    HasInv2PiInlineImm(false),
    HasSDWAOmod(false),
    HasSDWAScalar(false),
    HasSDWASdst(false),
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@ -72,6 +72,7 @@ protected:
  bool HasVOP3PInsts;
  bool HasMulI24;
  bool HasMulU24;
+  bool HasInv2PiInlineImm;
  bool HasFminFmaxLegacy;
  bool EnablePromoteAlloca;
  int LocalMemorySize;
@ -170,6 +171,10 @@ public:
    return HasMulU24;
  }

+  bool hasInv2PiInlineImm() const {
+    return HasInv2PiInlineImm;
+  }
+
  bool hasFminFmaxLegacy() const {
    return HasFminFmaxLegacy;
  }
@ -347,7 +352,6 @@ protected:
  bool HasVGPRIndexMode;
  bool HasScalarStores;
  bool HasScalarAtomics;
-  bool HasInv2PiInlineImm;
  bool HasSDWAOmod;
  bool HasSDWAScalar;
  bool HasSDWASdst;
@ -782,9 +786,6 @@ public:
    return HasScalarAtomics;
  }

-  bool hasInv2PiInlineImm() const {
-    return HasInv2PiInlineImm;
-  }

  bool hasDPP() const {
    return HasDPP;
--- a/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/test/CodeGen/AMDGPU/fneg-combines.ll
--- a/test/CodeGen/AMDGPU/fneg-combines.si.ll
+++ b/test/CodeGen/AMDGPU/fneg-combines.si.ll
@ -0,0 +1,28 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s
+
+; --------------------------------------------------------------------------------
+; rcp_legacy tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_rcp_legacy_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_rcp_legacy_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
+; GCN: {{buffer|flat}}_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %rcp = call float @llvm.amdgcn.rcp.legacy(float %a)
+  %fneg = fsub float -0.000000e+00, %rcp
+  store float %fneg, float addrspace(1)* %out.gep
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.amdgcn.rcp.legacy(float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }