[TTI] Add TargetCostKind argument to getUserCost
There are several different types of cost that TTI tries to provide explicit information for: throughput, latency, and code size, along with a vague 'intersection of code-size cost and execution cost'.

The vectorizer is a keen user of RecipThroughput, and there's at least 'getInstructionThroughput' and 'getArithmeticInstrCost' designed to help with this cost. The latency cost has a single use and a single implementation. The intersection cost appears to cover most of the rest of the API.

getUserCost is explicitly called from within TTI when the user has been explicit in wanting the code size (also only one use), as well as by a few passes which are concerned with a mixture of size and/or a relative cost. In many cases these costs are closely related, such as when multiple instructions are required, but one evident diverging cost in this function is for div/rem.

This patch adds an argument so that the cost required is explicit, so that we can make the important distinction when necessary.

Differential Revision: https://reviews.llvm.org/D78635
commit d44080f77a (parent beb72eda94)
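Before the diff, a minimal usage sketch (illustrative only, not part of the patch) of how callers look once migrated to the new overload. The helper names below are hypothetical; only the getUserCost signatures and the TCK_*/TCC_* names visible in the diff are relied on.

// Minimal usage sketch (illustrative only, not from the patch). The helper
// names are hypothetical; the TTI calls use the post-patch signatures shown
// in the diff below.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// True if \p I is free under the size-and-latency kind that most existing
// getUserCost callers (inline cost, LICM, SimplifyCFG) are migrated to.
static bool isFreeForSizeAndLatency(const TargetTransformInfo &TTI,
                                    const Instruction &I) {
  return TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
         TargetTransformInfo::TCC_Free;
}

// Accumulates an explicit code-size estimate for a block, mirroring how
// CodeMetrics and the loop-unroll cost analysis now pass TCK_CodeSize.
static int blockCodeSize(const TargetTransformInfo &TTI, const BasicBlock &BB) {
  int Size = 0;
  for (const Instruction &I : BB)
    Size += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
  return Size;
}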
@@ -153,7 +153,8 @@ public:
   enum TargetCostKind {
     TCK_RecipThroughput, ///< Reciprocal throughput.
     TCK_Latency, ///< The latency of instruction.
-    TCK_CodeSize ///< Instruction code size.
+    TCK_CodeSize, ///< Instruction code size.
+    TCK_SizeAndLatency ///< The weighted sum of size and latency.
   };
 
   /// Query the cost of a specified instruction.
@@ -172,7 +173,8 @@ public:
       return getInstructionLatency(I);
 
     case TCK_CodeSize:
-      return getUserCost(I);
+    case TCK_SizeAndLatency:
+      return getUserCost(I, kind);
     }
     llvm_unreachable("Unknown instruction cost kind");
   }
@@ -263,14 +265,15 @@ public:
   ///
   /// The returned cost is defined in terms of \c TargetCostConstants, see its
   /// comments for a detailed explanation of the cost values.
-  int getUserCost(const User *U, ArrayRef<const Value *> Operands) const;
+  int getUserCost(const User *U, ArrayRef<const Value *> Operands,
+                  TargetCostKind CostKind) const;
 
   /// This is a helper function which calls the two-argument getUserCost
   /// with \p Operands which are the current operands U has.
-  int getUserCost(const User *U) const {
+  int getUserCost(const User *U, TargetCostKind CostKind) const {
     SmallVector<const Value *, 4> Operands(U->value_op_begin(),
                                            U->value_op_end());
-    return getUserCost(U, Operands);
+    return getUserCost(U, Operands, CostKind);
   }
 
   /// Return true if branch divergence exists.
@@ -1170,7 +1173,8 @@ public:
   getEstimatedNumberOfCaseClusters(const SwitchInst &SI, unsigned &JTSize,
                                    ProfileSummaryInfo *PSI,
                                    BlockFrequencyInfo *BFI) = 0;
-  virtual int getUserCost(const User *U, ArrayRef<const Value *> Operands) = 0;
+  virtual int getUserCost(const User *U, ArrayRef<const Value *> Operands,
+                          TargetCostKind CostKind) = 0;
   virtual bool hasBranchDivergence() = 0;
   virtual bool useGPUDivergenceAnalysis() = 0;
   virtual bool isSourceOfDivergence(const Value *V) = 0;
@@ -1422,8 +1426,9 @@ public:
   int getMemcpyCost(const Instruction *I) override {
     return Impl.getMemcpyCost(I);
   }
-  int getUserCost(const User *U, ArrayRef<const Value *> Operands) override {
-    return Impl.getUserCost(U, Operands);
+  int getUserCost(const User *U, ArrayRef<const Value *> Operands,
+                  TargetCostKind CostKind) override {
+    return Impl.getUserCost(U, Operands, CostKind);
   }
   bool hasBranchDivergence() override { return Impl.hasBranchDivergence(); }
   bool useGPUDivergenceAnalysis() override {

@@ -792,9 +792,11 @@ public:
     return static_cast<T *>(this)->getIntrinsicCost(IID, RetTy, ParamTys, U);
   }
 
-  unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands) {
+  unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands,
+                       enum TTI::TargetCostKind CostKind) {
     auto *TargetTTI = static_cast<T *>(this);
 
+    // FIXME: Unlikely to be true for anything but CodeSize.
     if (const auto *CB = dyn_cast<CallBase>(U)) {
       const Function *F = CB->getCalledFunction();
       if (F) {
@@ -841,6 +843,7 @@ public:
     case Instruction::SRem:
     case Instruction::UDiv:
     case Instruction::URem:
+      // FIXME: Unlikely to be true for CodeSize.
       return TTI::TCC_Expensive;
     case Instruction::IntToPtr:
     case Instruction::PtrToInt:
@@ -867,7 +870,7 @@ public:
   int getInstructionLatency(const Instruction *I) {
     SmallVector<const Value *, 4> Operands(I->value_op_begin(),
                                            I->value_op_end());
-    if (getUserCost(I, Operands) == TTI::TCC_Free)
+    if (getUserCost(I, Operands, TTI::TCK_Latency) == TTI::TCC_Free)
       return 0;
 
     if (isa<LoadInst>(I))

@@ -172,7 +172,7 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
       if (InvI->cannotDuplicate())
         notDuplicatable = true;
 
-    NumInsts += TTI.getUserCost(&I);
+    NumInsts += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
   }
 
   if (isa<ReturnInst>(BB->getTerminator()))

@@ -803,7 +803,8 @@ bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
       Operands.push_back(SimpleOp);
     else
       Operands.push_back(*I);
-  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
+  return TargetTransformInfo::TCC_Free ==
+         TTI.getUserCost(&GEP, Operands, TargetTransformInfo::TCK_SizeAndLatency);
 }
 
 bool CallAnalyzer::visitAlloca(AllocaInst &I) {
@@ -1051,7 +1052,8 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
   if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
     SROAArgValues[&I] = SROAArg;
 
-  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
+  return TargetTransformInfo::TCC_Free ==
+         TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
 }
 
 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
@@ -1075,7 +1077,8 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
   if (auto *SROAArg = getSROAArgForValueOrNull(Op))
     SROAArgValues[&I] = SROAArg;
 
-  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
+  return TargetTransformInfo::TCC_Free ==
+         TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
 }
 
 bool CallAnalyzer::visitCastInst(CastInst &I) {
@@ -1105,7 +1108,8 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
     break;
   }
 
-  return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
+  return TargetTransformInfo::TCC_Free ==
+         TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
 }
 
 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
@@ -1807,7 +1811,8 @@ bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
 bool CallAnalyzer::visitInstruction(Instruction &I) {
   // Some instructions are free. All of the free intrinsics can also be
   // handled by SROA, etc.
-  if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
+  if (TargetTransformInfo::TCC_Free ==
+      TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency))
     return true;
 
   // We found something we don't understand or can't handle. Mark any SROA-able

@@ -178,8 +178,9 @@ unsigned TargetTransformInfo::getEstimatedNumberOfCaseClusters(
 }
 
 int TargetTransformInfo::getUserCost(const User *U,
-                                     ArrayRef<const Value *> Operands) const {
-  int Cost = TTIImpl->getUserCost(U, Operands);
+                                     ArrayRef<const Value *> Operands,
+                                     enum TargetCostKind CostKind) const {
+  int Cost = TTIImpl->getUserCost(U, Operands, CostKind);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
@@ -1152,7 +1153,7 @@ matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
 int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
-    return getUserCost(I);
+    return getUserCost(I, TCK_RecipThroughput);
 
   case Instruction::Ret:
   case Instruction::PHI:

@@ -6103,7 +6103,8 @@ static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
   // If it's safe to speculatively execute, then it should not have side
   // effects; therefore, it's safe to sink and possibly *not* execute.
   return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
-         TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive;
+         TTI->getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency) >=
+             TargetTransformInfo::TCC_Expensive;
 }
 
 /// Returns true if a SelectInst should be turned into an explicit branch.

@@ -949,11 +949,12 @@ void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   CommonTTI.getUnrollingPreferences(L, SE, UP);
 }
 
-unsigned GCNTTIImpl::getUserCost(const User *U,
-                                 ArrayRef<const Value *> Operands) {
+unsigned
+GCNTTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands,
+                        TTI::TargetCostKind CostKind) {
   const Instruction *I = dyn_cast<Instruction>(U);
   if (!I)
-    return BaseT::getUserCost(U, Operands);
+    return BaseT::getUserCost(U, Operands, CostKind);
 
   // Estimate different operations to be optimized out
   switch (I->getOpcode()) {
@@ -980,7 +981,7 @@ unsigned GCNTTIImpl::getUserCost(const User *U,
       return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
                                    FMF, 1, II);
     } else {
-      return BaseT::getUserCost(U, Operands);
+      return BaseT::getUserCost(U, Operands, CostKind);
     }
   }
   case Instruction::ShuffleVector: {
@@ -994,7 +995,7 @@ unsigned GCNTTIImpl::getUserCost(const User *U,
       return getShuffleCost(TTI::SK_ExtractSubvector, SrcTy, SubIndex, Ty);
 
     if (Shuffle->changesLength())
-      return BaseT::getUserCost(U, Operands);
+      return BaseT::getUserCost(U, Operands, CostKind);
 
     if (Shuffle->isIdentity())
       return 0;
@@ -1059,7 +1060,7 @@ unsigned GCNTTIImpl::getUserCost(const User *U,
     break;
   }
 
-  return BaseT::getUserCost(U, Operands);
+  return BaseT::getUserCost(U, Operands, CostKind);
 }
 
 unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {

@@ -242,7 +242,8 @@ public:
   int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                              bool IsPairwiseForm,
                              bool IsUnsigned);
-  unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
+  unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands,
+                       TTI::TargetCostKind CostKind);
 };
 
 class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {

@@ -1362,7 +1362,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
 
       SmallVector<const Value*, 4> Operands(I.value_op_begin(),
                                             I.value_op_end());
-      Cost += getUserCost(&I, Operands);
+      Cost += getUserCost(&I, Operands, TargetTransformInfo::TCK_CodeSize);
     }
   }
 

@@ -298,8 +298,10 @@ unsigned HexagonTTIImpl::getCacheLineSize() const {
   return ST.getL1CacheLineSize();
 }
 
-int HexagonTTIImpl::getUserCost(const User *U,
-                                ArrayRef<const Value *> Operands) {
+int
+HexagonTTIImpl::getUserCost(const User *U,
+                            ArrayRef<const Value *> Operands,
+                            TTI::TargetCostKind CostKind) {
   auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool {
     if (!CI->isIntegerCast())
       return false;
@@ -321,7 +323,7 @@ int HexagonTTIImpl::getUserCost(const User *U,
   if (const CastInst *CI = dyn_cast<const CastInst>(U))
     if (isCastFoldedIntoLoad(CI))
       return TargetTransformInfo::TCC_Free;
-  return BaseT::getUserCost(U, Operands);
+  return BaseT::getUserCost(U, Operands, CostKind);
 }
 
 bool HexagonTTIImpl::shouldBuildLookupTables() const {

@@ -148,7 +148,8 @@ public:
 
   /// @}
 
-  int getUserCost(const User *U, ArrayRef<const Value *> Operands);
+  int getUserCost(const User *U, ArrayRef<const Value *> Operands,
+                  TTI::TargetCostKind CostKind);
 
   // Hexagon specific decision to generate a lookup table.
   bool shouldBuildLookupTables() const;

@@ -206,15 +206,16 @@ int PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
   return PPCTTIImpl::getIntImmCost(Imm, Ty);
 }
 
-unsigned PPCTTIImpl::getUserCost(const User *U,
-                                 ArrayRef<const Value *> Operands) {
+unsigned
+PPCTTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands,
+                        TTI::TargetCostKind CostKind) {
   if (U->getType()->isVectorTy()) {
     // Instructions that need to be split should cost more.
     std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, U->getType());
-    return LT.first * BaseT::getUserCost(U, Operands);
+    return LT.first * BaseT::getUserCost(U, Operands, CostKind);
   }
 
-  return BaseT::getUserCost(U, Operands);
+  return BaseT::getUserCost(U, Operands, CostKind);
 }
 
 bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,

@@ -52,7 +52,8 @@ public:
   int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                           Type *Ty);
 
-  unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
+  unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands,
+                       TTI::TargetCostKind CostKind);
 
   TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
   bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,

@@ -3641,8 +3641,9 @@ int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
   return X86TTIImpl::getIntImmCost(Imm, Ty);
 }
 
-unsigned X86TTIImpl::getUserCost(const User *U,
-                                 ArrayRef<const Value *> Operands) {
+unsigned
+X86TTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands,
+                        TTI::TargetCostKind CostKind) {
   if (isa<StoreInst>(U)) {
     Value *Ptr = U->getOperand(1);
     // Store instruction with index and scale costs 2 Uops.
@@ -3653,7 +3654,7 @@ unsigned X86TTIImpl::getUserCost(const User *U,
     }
     return TTI::TCC_Basic;
   }
-  return BaseT::getUserCost(U, Operands);
+  return BaseT::getUserCost(U, Operands, CostKind);
 }
 
 // Return an average cost of Gather / Scatter instruction, maybe improved later

@@ -181,7 +181,8 @@ public:
 
   int getIntImmCost(const APInt &Imm, Type *Ty);
 
-  unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
+  unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands,
+                       TTI::TargetCostKind);
 
   int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
   int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,

@@ -1261,7 +1261,8 @@ static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
                          const TargetTransformInfo *TTI) {
 
   if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
-    if (TTI->getUserCost(GEP) != TargetTransformInfo::TCC_Free)
+    if (TTI->getUserCost(GEP, TargetTransformInfo::TCK_SizeAndLatency) !=
+        TargetTransformInfo::TCC_Free)
       return false;
     // For a GEP, we cannot simply use getUserCost because currently it
     // optimistically assume that a GEP will fold into addressing mode
@@ -1276,7 +1277,8 @@ static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
     }
     return true;
   } else
-    return TTI->getUserCost(&I) == TargetTransformInfo::TCC_Free;
+    return TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
+           TargetTransformInfo::TCC_Free;
 }
 
 /// Return true if the only users of this instruction are outside of

@@ -446,7 +446,7 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
 
       // First accumulate the cost of this instruction.
      if (!Cost.IsFree) {
-        UnrolledCost += TTI.getUserCost(I);
+        UnrolledCost += TTI.getUserCost(I, TargetTransformInfo::TCK_CodeSize);
        LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
                          << Iteration << "): ");
        LLVM_DEBUG(I->dump());
@@ -539,7 +539,7 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
 
      // Track this instruction's expected baseline cost when executing the
      // rolled loop form.
-      RolledDynamicCost += TTI.getUserCost(&I);
+      RolledDynamicCost += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
 
      // Visit the instruction to analyze its loop cost after unrolling,
      // and if the visitor returns true, mark the instruction as free after

@@ -2660,7 +2660,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
         if (CB->isConvergent() || CB->cannotDuplicate())
           return false;
 
-      Cost += TTI.getUserCost(&I);
+      Cost += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
     }
     assert(Cost >= 0 && "Must not have negative costs!");
     LoopCost += Cost;

@@ -465,7 +465,7 @@ findProfitablePHIs(ArrayRef<PHINode *> PNs,
         if (CostMapIt != SpecCostMap.end())
           Cost += CostMapIt->second;
       }
-      Cost += TTI.getUserCost(I);
+      Cost += TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
       bool Inserted = SpecCostMap.insert({I, Cost}).second;
       (void)Inserted;
       assert(Inserted && "Must not re-insert a cost during the DFS!");

@@ -244,7 +244,7 @@ static unsigned ComputeSpeculationCost(const Instruction *I,
   case Instruction::FNeg:
   case Instruction::ICmp:
   case Instruction::FCmp:
-    return TTI.getUserCost(I);
+    return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
 
   default:
     return UINT_MAX; // Disallow anything not whitelisted.

|
||||
const TargetTransformInfo &TTI) {
|
||||
assert(isSafeToSpeculativelyExecute(I) &&
|
||||
"Instruction is not safe to speculatively execute!");
|
||||
return TTI.getUserCost(I);
|
||||
return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
|
||||
}
|
||||
|
||||
/// If we have a merge point of an "if condition" as accepted above,
|
||||
@ -3045,7 +3045,7 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
|
||||
return false; // Not in white-list - not worthwhile folding.
|
||||
// And finally, if this is a non-free instruction that we are okay
|
||||
// speculating, ensure that we consider the speculation budget.
|
||||
BudgetRemaining -= TTI.getUserCost(&I);
|
||||
BudgetRemaining -= TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
|
||||
if (BudgetRemaining < 0)
|
||||
return false; // Eagerly refuse to fold as soon as we're out of budget.
|
||||
}
|
||||
|