[PartiallyInlineLibCalls][x86] add TTI hook to allow sqrt inlining to depend on arg rather than result

This should fix PR31455:
https://bugs.llvm.org/show_bug.cgi?id=31455

Differential Revision: https://reviews.llvm.org/D28314

llvm-svn: 319094
2025-01-31 12:41:49 +01:00 · 2017-11-27 21:15:43 +00:00 · 2017-11-27 21:15:43 +00:00 · 49d4f16628
commit 49d4f16628
parent af46cd40a3
9 changed files with 58 additions and 25 deletions
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -586,6 +586,12 @@ public:
  /// \brief Return true if the hardware has a fast square-root instruction.
  bool haveFastSqrt(Type *Ty) const;

+  /// Return true if it is faster to check if a floating-point value is NaN
+  /// (or not-NaN) versus a comparison against a constant FP zero value.
+  /// Targets should override this if materializing a 0.0 for comparison is
+  /// generally as cheap as checking for ordered/unordered.
+  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) const;
+
  /// \brief Return the expected cost of supporting the floating point operation
  /// of the specified type.
  int getFPOpCost(Type *Ty) const;
@@ -1009,6 +1015,7 @@ public:
                                              bool *Fast) = 0;
  virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
  virtual bool haveFastSqrt(Type *Ty) = 0;
+  virtual bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) = 0;
  virtual int getFPOpCost(Type *Ty) = 0;
  virtual int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
                                    Type *Ty) = 0;
@@ -1273,6 +1280,10 @@ public:
  }
  bool haveFastSqrt(Type *Ty) override { return Impl.haveFastSqrt(Ty); }

+  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) override {
+    return Impl.isFCmpOrdCheaperThanFCmpZero(Ty);
+  }
+
  int getFPOpCost(Type *Ty) override { return Impl.getFPOpCost(Ty); }

  int getIntImmCodeSizeCost(unsigned Opc, unsigned Idx, const APInt &Imm,
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -316,6 +316,8 @@ public:

  bool haveFastSqrt(Type *Ty) { return false; }

+  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) { return true; }
+  
  unsigned getFPOpCost(Type *Ty) { return TargetTransformInfo::TCC_Basic; }

  int getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -297,6 +297,10 @@ public:
           TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
  }

+  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
+    return true;
+  }
+
  unsigned getFPOpCost(Type *Ty) {
    // By default, FP instructions are no more expensive since they are
    // implemented in HW.  Target specific TTI can override this.
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -281,6 +281,10 @@ bool TargetTransformInfo::haveFastSqrt(Type *Ty) const {
  return TTIImpl->haveFastSqrt(Ty);
 }

+bool TargetTransformInfo::isFCmpOrdCheaperThanFCmpZero(Type *Ty) const {
+  return TTIImpl->isFCmpOrdCheaperThanFCmpZero(Ty);
+}
+
 int TargetTransformInfo::getFPOpCost(Type *Ty) const {
  int Cost = TTIImpl->getFPOpCost(Ty);
  assert(Cost >= 0 && "TTI should not produce negative costs!");
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -2537,6 +2537,10 @@ bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
  return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
 }

+bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
+  return false;
+}
+
 bool X86TTIImpl::areInlineCompatible(const Function *Caller,
                                     const Function *Callee) const {
  const TargetMachine &TM = getTLI()->getTargetMachine();
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -125,6 +125,7 @@ public:
  bool isLegalMaskedGather(Type *DataType);
  bool isLegalMaskedScatter(Type *DataType);
  bool hasDivRemOp(Type *DataType, bool IsSigned);
+  bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;
  const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
--- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -26,7 +26,8 @@ using namespace llvm;


 static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
-                         BasicBlock &CurrBB, Function::iterator &BB) {
+                         BasicBlock &CurrBB, Function::iterator &BB,
+                         const TargetTransformInfo *TTI) {
  // There is no need to change the IR, since backend will emit sqrt
  // instruction if the call has already been marked read-only.
  if (Call->onlyReadsMemory())
@@ -39,7 +40,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
  //
  // (after)
  // v0 = sqrt_noreadmem(src) # native sqrt instruction.
-  // if (v0 is a NaN)
+  // [if (v0 is a NaN) || if (!(src >= 0.0))]  # src negative or NaN
  //   v1 = sqrt(src)         # library call.
  // dst = phi(v0, v1)
  //
@@ -48,7 +49,8 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
  // Create phi and replace all uses.
  BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
  IRBuilder<> Builder(JoinBB, JoinBB->begin());
-  PHINode *Phi = Builder.CreatePHI(Call->getType(), 2);
+  Type *Ty = Call->getType();
+  PHINode *Phi = Builder.CreatePHI(Ty, 2);
  Call->replaceAllUsesWith(Phi);

  // Create basic block LibCallBB and insert a call to library function sqrt.
@@ -65,7 +67,10 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
  Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
  CurrBB.getTerminator()->eraseFromParent();
  Builder.SetInsertPoint(&CurrBB);
-  Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
+  Value *FCmp = TTI->isFCmpOrdCheaperThanFCmpZero(Ty)
+                    ? Builder.CreateFCmpORD(Call, Call)
+                    : Builder.CreateFCmpOGE(Call->getOperand(0),
+                                            ConstantFP::get(Ty, 0.0));
  Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);

  // Add phi operands.
@@ -106,7 +111,7 @@ static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
      case LibFunc_sqrtf:
      case LibFunc_sqrt:
        if (TTI->haveFastSqrt(Call->getType()) &&
-            optimizeSQRT(Call, CalledFunc, *CurrBB, BB))
+            optimizeSQRT(Call, CalledFunc, *CurrBB, BB, TTI))
          break;
        continue;
      default:
--- a/test/CodeGen/X86/sqrt-partial.ll
+++ b/test/CodeGen/X86/sqrt-partial.ll
@@ -3,7 +3,7 @@

 ; PR31455 - https://bugs.llvm.org/show_bug.cgi?id=31455
 ; We have to assume that errno can be set, so we have to make a libcall in that case.
-; But it's better for perf to check that the argument is valid rather than the result of 
+; But it's better for perf to check that the argument is valid rather than the result of
 ; sqrtss/sqrtsd.
 ; Note: This is really a test of the -partially-inline-libcalls IR pass (and we have an IR test
 ; for that), but we're checking the final asm to make sure that comes out as expected too.
@@ -11,11 +11,11 @@
 define float @f(float %val) nounwind {
 ; CHECK-LABEL: f:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    sqrtss %xmm0, %xmm1
-; CHECK-NEXT:    ucomiss %xmm1, %xmm1
-; CHECK-NEXT:    jp .LBB0_2
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    ucomiss %xmm1, %xmm0
+; CHECK-NEXT:    jb .LBB0_2
 ; CHECK-NEXT:  # BB#1: # %.split
-; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    sqrtss %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB0_2: # %call.sqrt
 ; CHECK-NEXT:    jmp sqrtf # TAILCALL
@@ -26,11 +26,11 @@ define float @f(float %val) nounwind {
 define double @d(double %val) nounwind {
 ; CHECK-LABEL: d:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    sqrtsd %xmm0, %xmm1
-; CHECK-NEXT:    ucomisd %xmm1, %xmm1
-; CHECK-NEXT:    jp .LBB1_2
+; CHECK-NEXT:    xorps %xmm1, %xmm1
+; CHECK-NEXT:    ucomisd %xmm1, %xmm0
+; CHECK-NEXT:    jb .LBB1_2
 ; CHECK-NEXT:  # BB#1: # %.split
-; CHECK-NEXT:    movapd %xmm1, %xmm0
+; CHECK-NEXT:    sqrtsd %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB1_2: # %call.sqrt
 ; CHECK-NEXT:    jmp sqrt # TAILCALL
--- a/test/Transforms/PartiallyInlineLibCalls/X86/good-prototype.ll
+++ b/test/Transforms/PartiallyInlineLibCalls/X86/good-prototype.ll
@@ -1,18 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -partially-inline-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
 ; RUN: opt -S -passes=partially-inline-libcalls -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s

 define float @f(float %val) {
-; CHECK: @f
-; CHECK: entry:
-; CHECK-NEXT: %[[RES:.+]] = tail call float @sqrtf(float %val) #0
-; CHECK-NEXT: %[[CMP:.+]] = fcmp oeq float %[[RES]], %[[RES]]
-; CHECK-NEXT: br i1 %[[CMP]], label %[[EXIT:.+]], label %[[CALL:.+]]
-; CHECK: [[CALL]]:
-; CHECK-NEXT: %[[RES2:.+]] = tail call float @sqrtf(float %val){{$}}
-; CHECK-NEXT: br label %[[EXIT]]
-; CHECK: [[EXIT]]:
-; CHECK-NEXT: %[[RET:.+]] = phi float [ %[[RES]], %entry ], [ %[[RES2]], %[[CALL]] ]
-; CHECK-NEXT: ret float %[[RET]]
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RES:%.*]] = tail call float @sqrtf(float [[VAL:%.*]]) #0
+; CHECK-NEXT:    [[TMP0:%.*]] = fcmp oge float [[VAL]], 0.000000e+00
+; CHECK-NEXT:    br i1 [[TMP0]], label [[ENTRY_SPLIT:%.*]], label [[CALL_SQRT:%.*]]
+; CHECK:       call.sqrt:
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call float @sqrtf(float [[VAL]])
+; CHECK-NEXT:    br label [[ENTRY_SPLIT]]
+; CHECK:       entry.split:
+; CHECK-NEXT:    [[TMP2:%.*]] = phi float [ [[RES]], [[ENTRY:%.*]] ], [ [[TMP1]], [[CALL_SQRT]] ]
+; CHECK-NEXT:    ret float [[TMP2]]
+;
 entry:
  %res = tail call float @sqrtf(float %val)
  ret float %res