From 0f4c2ea58a3e6dd461678c0f4a5964c318837944 Mon Sep 17 00:00:00 2001
From: David Green <david.green@arm.com>
Date: Tue, 23 Feb 2021 13:04:59 +0000
Subject: [PATCH] [TTI] Change getOperandsScalarizationOverhead to take Type
 args

As a follow-up to D95291, getOperandsScalarizationOverhead was still
using VF as a vector factor even when the arguments were scalar, and
would assert on certain matrix intrinsics with differently sized vector
arguments. This patch removes the VF arg, instead passing the Types
through directly. This should allow it to more accurately compute the
cost without having to guess at which operands will be vectorized,
something difficult with more complex intrinsics.

This adjusts one SVE test, as it now wrongly selects the intrinsic
instead of the veclib call. Without invalid InstructionCosts the cost
of the scalarized intrinsic is too low. This should get fixed when the
cost of scalarization is accounted for with scalable types.

Differential Revision: https://reviews.llvm.org/D96287
---
 include/llvm/Analysis/TargetTransformInfo.h   | 12 ++---
 .../llvm/Analysis/TargetTransformInfoImpl.h   |  2 +-
 include/llvm/CodeGen/BasicTTIImpl.h           | 50 +++++++++----------
 lib/Analysis/TargetTransformInfo.cpp          |  4 +-
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  6 ++-
 lib/Target/ARM/ARMTargetTransformInfo.cpp     |  3 +-
 .../Hexagon/HexagonTargetTransformInfo.cpp    |  7 +--
 .../Hexagon/HexagonTargetTransformInfo.h      |  2 +-
 .../SystemZ/SystemZTargetTransformInfo.cpp    | 14 ++++--
 lib/Transforms/Vectorize/LoopVectorize.cpp    | 17 ++++---
 test/Analysis/CostModel/PowerPC/matrix.ll     | 22 ++++++++
 .../LoopVectorize/AArch64/scalable-call.ll    |  4 +-
 12 files changed, 89 insertions(+), 54 deletions(-)
 create mode 100644 test/Analysis/CostModel/PowerPC/matrix.ll

diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index 255a02e7ffd..8205df794a3 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -726,10 +726,10 @@ public:
                                     bool Insert, bool Extract) const;
 
   /// Estimate the overhead of scalarizing an instruction's unique
-  /// non-constant operands. The types of the arguments are ordinarily
-  /// scalar, in which case the costs are multiplied with VF.
+  /// non-constant operands. The (potentially vector) types to use for each
+  /// argument are passed via Tys.
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF) const;
+                                            ArrayRef<Type *> Tys) const;
 
   /// If target has efficient vector element load/store instructions, it can
   /// return true here so that insertion/extraction costs are not added to
@@ -1479,7 +1479,7 @@ public:
                                             bool Insert, bool Extract) = 0;
   virtual unsigned
   getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                   unsigned VF) = 0;
+                                   ArrayRef<Type *> Tys) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual MemCmpExpansionOptions
@@ -1864,8 +1864,8 @@ public:
     return Impl.getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
   }
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF) override {
-    return Impl.getOperandsScalarizationOverhead(Args, VF);
+                                            ArrayRef<Type *> Tys) override {
+    return Impl.getOperandsScalarizationOverhead(Args, Tys);
   }
   bool supportsEfficientVectorElementLoadStore() override {
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index 97259bb1e61..83b65760cef 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -289,7 +289,7 @@ public:
   }
 
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF) const {
+                                            ArrayRef<Type *> Tys) const {
     return 0;
   }
 
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index c2b1a7a7f83..b7468e83112 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -609,51 +609,48 @@ public:
     return thisT()->getScalarizationOverhead(Ty, DemandedElts, Insert,
                                              Extract);
   }
 
-  /// Estimate the overhead of scalarizing an instruction's unique
-  /// non-constant operands. The types of the arguments are ordinarily
-  /// scalar, in which case the costs are multiplied with VF.
+  /// Estimate the overhead of scalarizing an instruction's unique
+  /// non-constant operands. The (potentially vector) types to use for each
+  /// argument are passed via Tys.
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF) {
+                                            ArrayRef<Type *> Tys) {
+    assert(Args.size() == Tys.size() && "Expected matching Args and Tys");
+
     unsigned Cost = 0;
     SmallPtrSet<const Value *, 4> UniqueOperands;
-    for (const Value *A : Args) {
+    for (int I = 0, E = Args.size(); I != E; I++) {
       // Disregard things like metadata arguments.
-      Type *Ty = A->getType();
+      const Value *A = Args[I];
+      Type *Ty = Tys[I];
       if (!Ty->isIntOrIntVectorTy() && !Ty->isFPOrFPVectorTy() &&
          !Ty->isPtrOrPtrVectorTy())
        continue;
 
       if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
-        auto *VecTy = dyn_cast<VectorType>(Ty);
-        if (VecTy) {
-          // If A is a vector operand, VF should be 1 or correspond to A.
-          assert((VF == 1 ||
-                  VF == cast<FixedVectorType>(VecTy)->getNumElements()) &&
-                 "Vector argument does not match VF");
-        }
-        else
-          VecTy = FixedVectorType::get(Ty, VF);
-
-        Cost += getScalarizationOverhead(VecTy, false, true);
+        if (auto *VecTy = dyn_cast<VectorType>(Ty))
+          Cost += getScalarizationOverhead(VecTy, false, true);
       }
     }
 
     return Cost;
   }
 
-  unsigned getScalarizationOverhead(VectorType *InTy,
-                                    ArrayRef<const Value *> Args) {
-    auto *Ty = cast<FixedVectorType>(InTy);
-
+  /// Estimate the overhead of scalarizing the inputs and outputs of an
+  /// instruction, with return type RetTy and arguments Args of type Tys. If
+  /// Args are unknown (empty), then the cost associated with one argument is
+  /// added as a heuristic.
+  unsigned getScalarizationOverhead(VectorType *RetTy,
+                                    ArrayRef<const Value *> Args,
+                                    ArrayRef<Type *> Tys) {
     unsigned Cost = 0;
-    Cost += getScalarizationOverhead(Ty, true, false);
+    Cost += getScalarizationOverhead(RetTy, true, false);
     if (!Args.empty())
-      Cost += getOperandsScalarizationOverhead(Args, Ty->getNumElements());
+      Cost += getOperandsScalarizationOverhead(Args, Tys);
     else
       // When no information on arguments is provided, we add the cost
       // associated with one argument as a heuristic.
-      Cost += getScalarizationOverhead(Ty, false, true);
+      Cost += getScalarizationOverhead(RetTy, false, true);
 
     return Cost;
   }
 
@@ -710,7 +707,8 @@ public:
                                            Opd1PropInfo, Opd2PropInfo, Args, CxtI);
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
-      return getScalarizationOverhead(VTy, Args) + Num * Cost;
+      SmallVector<Type *> Tys(Args.size(), Ty);
+      return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
     }
 
     // We don't know anything about this scalar instruction.
@@ -1354,7 +1352,7 @@ public:
         ScalarizationCost +=
             getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
       ScalarizationCost +=
-          getOperandsScalarizationOverhead(Args, RetVF.getKnownMinValue());
+          getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
     }
 
     IntrinsicCostAttributes Attrs(IID, RetTy, ICA.getArgTypes(), FMF, I,
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index ac2e5c5c2aa..0b2fdcbca40 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -469,8 +469,8 @@ TargetTransformInfo::getScalarizationOverhead(VectorType *Ty,
 }
 
 unsigned TargetTransformInfo::getOperandsScalarizationOverhead(
-    ArrayRef<const Value *> Args, unsigned VF) const {
-  return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
+    ArrayRef<const Value *> Args, ArrayRef<Type *> Tys) const {
+  return TTIImpl->getOperandsScalarizationOverhead(Args, Tys);
 }
 
 bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 2b0f9b5599d..eb2733ff031 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -549,7 +549,8 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                            Opd1PropInfo, Opd2PropInfo, Args, CxtI);
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
-      return getScalarizationOverhead(VTy, Args) + Num * Cost;
+      SmallVector<Type *> Tys(Args.size(), Ty);
+      return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
     }
 
     // We don't know anything about this scalar instruction.
@@ -752,7 +753,8 @@ int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
       if (!RetTy->isVoidTy())
         ScalarizationCost +=
             getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
-      ScalarizationCost += getOperandsScalarizationOverhead(Args, RetVF);
+      ScalarizationCost +=
+          getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
     }
 
     IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 023c44d4f14..4b403c83037 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1363,7 +1363,8 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                              CostKind);
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
-      return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost;
+      SmallVector<Type *> Tys(Args.size(), Ty);
+      return BaseT::getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
     }
 
     return BaseCost;
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index c6ef63b2aa3..063a5571c13 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -118,9 +118,10 @@ unsigned HexagonTTIImpl::getScalarizationOverhead(VectorType *Ty,
   return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
 }
 
-unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
-    ArrayRef<const Value *> Args, unsigned VF) {
-  return BaseT::getOperandsScalarizationOverhead(Args, VF);
+unsigned
+HexagonTTIImpl::getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+                                                 ArrayRef<Type *> Tys) {
+  return BaseT::getOperandsScalarizationOverhead(Args, Tys);
 }
 
 unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
diff --git a/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index 10b31088e45..61d50b9457a 100644
--- a/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -107,7 +107,7 @@ public:
   unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
                                     bool Insert, bool Extract);
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
-                                            unsigned VF);
+                                            ArrayRef<Type *> Tys);
   unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys,
                             TTI::TargetCostKind CostKind);
   unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index e7ac2391512..535f164baf2 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -487,8 +487,10 @@ int SystemZTTIImpl::getArithmeticInstrCost(
     if (DivRemConstPow2)
       return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
-    if (DivRemConst)
-      return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args);
+    if (DivRemConst) {
+      SmallVector<Type *> Tys(Args.size(), Ty);
+      return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args, Tys);
+    }
     if ((SignedDivRem || UnsignedDivRem) && VF > 4)
       // Temporary hack: disable high vectorization factors with integer
       // division/remainder, which will get scalarized and handled with
@@ -511,7 +513,9 @@ int SystemZTTIImpl::getArithmeticInstrCost(
       // inserting and extracting the values.
       unsigned ScalarCost =
           getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
-      unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(VTy, Args);
+      SmallVector<Type *> Tys(Args.size(), Ty);
+      unsigned Cost =
+          (VF * ScalarCost) + getScalarizationOverhead(VTy, Args, Tys);
       // FIXME: VF 2 for these FP operations are currently just as
       // expensive as for VF 4.
       if (VF == 2)
@@ -528,7 +532,9 @@ int SystemZTTIImpl::getArithmeticInstrCost(
 
     // There is no native support for FRem.
     if (Opcode == Instruction::FRem) {
-      unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args);
+      SmallVector<Type *> Tys(Args.size(), Ty);
+      unsigned Cost =
+          (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args, Tys);
       // FIXME: VF 2 for float is currently just as expensive as for VF 4.
       if (VF == 2 && ScalarBits == 32)
         Cost *= 2;
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 762302b779f..cebb8ece282 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3634,15 +3634,15 @@ LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
   return Cost;
 }
 
+static Type *MaybeVectorizeType(Type *Elt, ElementCount VF) {
+  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
+    return Elt;
+  return VectorType::get(Elt, VF);
+}
+
 InstructionCost
 LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                    ElementCount VF) {
-  auto MaybeVectorizeType = [](Type *Elt, ElementCount VF) -> Type * {
-    if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
-      return Elt;
-    return VectorType::get(Elt, VF);
-  };
-
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   assert(ID && "Expected intrinsic call!");
   Type *RetTy = MaybeVectorizeType(CI->getType(), VF);
@@ -6914,8 +6914,11 @@ LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
 
   // Skip operands that do not require extraction/scalarization and do not incur
   // any overhead.
+  SmallVector<Type *> Tys;
+  for (auto *V : filterExtractingOperands(Ops, VF))
+    Tys.push_back(MaybeVectorizeType(V->getType(), VF));
   return Cost + TTI.getOperandsScalarizationOverhead(
-                    filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
+                    filterExtractingOperands(Ops, VF), Tys);
 }
 
 void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
diff --git a/test/Analysis/CostModel/PowerPC/matrix.ll b/test/Analysis/CostModel/PowerPC/matrix.ll
new file mode 100644
index 00000000000..4f0416f7c73
--- /dev/null
+++ b/test/Analysis/CostModel/PowerPC/matrix.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -mattr=+vsx | FileCheck %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; This test checks that we don't crash on certain matrix operations, more
+; than it checks the cost of the intrinsics per se.
+
+define void @matrix() {
+; CHECK-LABEL: 'matrix'
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %matrix1 = call <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nonnull align 4 undef, i64 1, i1 false, i32 1, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 452 for instruction: %0 = call <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32> undef, <1 x i32> %matrix1, i32 10, i32 1, i32 1)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+entry:
+  %matrix1 = call <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nonnull align 4 undef, i64 1, i1 false, i32 1, i32 1)
+  %0 = call <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32> undef, <1 x i32> %matrix1, i32 10, i32 1, i32 1)
+  ret void
+}
+
+declare <1 x i32> @llvm.matrix.column.major.load.v1i32(i32* nocapture, i64, i1 immarg, i32 immarg, i32 immarg) #2
+declare <10 x i32> @llvm.matrix.multiply.v10i32.v10i32.v1i32(<10 x i32>, <1 x i32>, i32 immarg, i32 immarg, i32 immarg) #3
diff --git a/test/Transforms/LoopVectorize/AArch64/scalable-call.ll b/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
index 8302dd2e879..bb70cbcfd4e 100644
--- a/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
+++ b/test/Transforms/LoopVectorize/AArch64/scalable-call.ll
@@ -72,10 +72,11 @@ for.end:
 }
 
 define void @vec_intrinsic(i64 %N, double* nocapture readonly %a) {
+;; FIXME: Should be calling sin_vec, once the cost of scalarizing is handled.
 ; CHECK-LABEL: @vec_intrinsic
 ; CHECK: vector.body:
 ; CHECK: %[[LOAD:.*]] = load <vscale x 2 x double>, <vscale x 2 x double>*
-; CHECK: call fast <vscale x 2 x double> @sin_vec(<vscale x 2 x double> %[[LOAD]])
+; CHECK: call fast <vscale x 2 x double> @llvm.sin.nxv2f64(<vscale x 2 x double> %[[LOAD]])
 entry:
   %cmp7 = icmp sgt i64 %N, 0
   br i1 %cmp7, label %for.body, label %for.end
@@ -86,6 +87,7 @@ for.body:
   %0 = load double, double* %arrayidx, align 8
   %1 = call fast double @llvm.sin.f64(double %0) #2
   %add = fadd fast double %1, 1.000000e+00
+  store double %add, double* %arrayidx, align 8
   %iv.next = add nuw nsw i64 %iv, 1
   %exitcond = icmp eq i64 %iv.next, %N
   br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !1
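
For illustration, a minimal caller-side sketch of the pattern the new
signature enables. The names widenOperandType and operandScalarizationCost
below are hypothetical, not part of the patch (the real callers are
getVectorIntrinsicCost and getScalarizationOverhead in the LoopVectorize.cpp
hunk above); the point is that each operand now carries its own, possibly
vector, type, so an intrinsic with differently sized vector arguments, such
as the llvm.matrix.multiply case in the PowerPC test (<10 x i32> and
<1 x i32> operands), no longer has to be described by a single VF.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Mirrors MaybeVectorizeType from the LoopVectorize.cpp hunk: only integer,
// pointer and floating-point scalars are widened to vectors of VF elements.
static Type *widenOperandType(Type *Elt, ElementCount VF) {
  if (VF.isScalar() || (!Elt->isIntOrPtrTy() && !Elt->isFloatingPointTy()))
    return Elt;
  return VectorType::get(Elt, VF);
}

// Build one (possibly vector) type per call operand and hand both arrays,
// kept index-aligned, to the changed TTI hook.
static unsigned operandScalarizationCost(const TargetTransformInfo &TTI,
                                         CallInst *CI, ElementCount VF) {
  SmallVector<const Value *, 4> Args;
  SmallVector<Type *, 4> Tys;
  for (Value *Op : CI->args()) {
    Args.push_back(Op);
    Tys.push_back(widenOperandType(Op->getType(), VF));
  }
  // Before this patch the second argument was a bare element count:
  //   TTI.getOperandsScalarizationOverhead(Args, VF.getKnownMinValue());
  return TTI.getOperandsScalarizationOverhead(Args, Tys);
}

Constant operands and duplicate uses are still filtered out inside the hook
itself, as the BasicTTIImpl.h hunk shows, so a caller only needs to keep
Args and Tys index-aligned.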