
[SLP] Support vectorizing functions provided by vector libs.

The SLPVectorizer is currently not aware of vector versions of
functions provided by libraries like Accelerate [1].
This patch updates the SLPVectorizer to use the same infrastructure
the LoopVectorizer uses to detect vectorizable library functions.
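
That infrastructure is the VFShape/VFDatabase query populated by the
inject-tli-mappings pass (hence the new pass dependency below). A minimal
sketch of the lookup, not part of the patch and with an illustrative helper
name, assuming the mappings have already been injected:

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Ask VFDatabase whether this call has a vector variant with VF fixed-width
// lanes and no global predicate, e.g. @vsinf for llvm.sin.f32 when building
// with -vector-library=Accelerate. Returns nullptr if there is no mapping.
static Function *findVectorVariant(CallInst &CI, unsigned VF) {
  VFShape Shape = VFShape::get(CI, /*EC=*/{VF, /*Scalable=*/false},
                               /*HasGlobalPred=*/false);
  return VFDatabase(CI).getVectorizedFunction(Shape);
}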

For calls, it now computes both the cost of an intrinsic call (existing
behavior) and the cost of a vector library call, if one is available. Like
the LoopVectorizer, it models the library version simply as the cost of a
call to a vector function, with no per-lane scalarization overhead.
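
Condensed into one self-contained sketch, the decision looks roughly as
follows (it mirrors the new getVectorCallCosts helper and its call sites in
the diff below; the wrapper name pickVectorCallCost is illustrative):

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Operator.h"
#include <algorithm>
using namespace llvm;

// Cost a vectorized call both as a widened intrinsic and as a call to the
// TLI-provided vector library function (if any), and return the cheaper one.
static int pickVectorCallCost(CallInst *CI, VectorType *VecTy,
                              TargetTransformInfo *TTI,
                              TargetLibraryInfo *TLI) {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
    FMF = FPMO->getFastMathFlags();
  SmallVector<Value *, 4> Args(CI->arg_operands());

  // (a) Widened intrinsic: the cost the SLP vectorizer already used.
  int IntrinsicCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
                                                 VecTy->getNumElements());

  // (b) Library call: only if the call may be treated as a builtin and the
  // VFDatabase has a variant for this vectorization factor.
  int LibCost = IntrinsicCost;
  VFShape Shape = VFShape::get(
      *CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
      /*HasGlobalPred=*/false);
  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
  if (!CI->isNoBuiltin() && VecFunc) {
    SmallVector<Type *, 4> VecTys;
    for (Use &Arg : CI->args())
      VecTys.push_back(
          VectorType::get(Arg->getType(), VecTy->getNumElements()));
    LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys);
  }
  return std::min(IntrinsicCost, LibCost);
}

In the actual patch, getEntryCost takes this minimum for the vector call
cost, and vectorizeTree emits the intrinsic only when it is not more
expensive than the library call (VecCallCosts.first <= VecCallCosts.second).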

[1] https://developer.apple.com/documentation/accelerate

Reviewers: ABataev, RKSimon, spatel

Reviewed By: ABataev

Differential Revision: https://reviews.llvm.org/D75878
Florian Hahn 2020-03-10 13:03:43 +00:00
parent 13106dbb0f
commit ac56a0239b
5 changed files with 79 additions and 35 deletions


@@ -85,6 +85,7 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
@@ -3227,6 +3228,39 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
});
}
std::pair<unsigned, unsigned> getVectorCallCosts(CallInst *CI,
VectorType *VecTy,
TargetTransformInfo *TTI,
TargetLibraryInfo *TLI) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
FastMathFlags FMF;
if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
FMF = FPMO->getFastMathFlags();
SmallVector<Value *, 4> Args(CI->arg_operands());
int IntrinsicCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
VecTy->getNumElements());
auto Shape =
VFShape::get(*CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
false /*HasGlobalPred*/);
Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
int LibCost = IntrinsicCost;
if (!CI->isNoBuiltin() && VecFunc) {
// Calculate the cost of the vector library call.
SmallVector<Type *, 4> VecTys;
for (Use &Arg : CI->args())
VecTys.push_back(
VectorType::get(Arg->getType(), VecTy->getNumElements()));
// If the corresponding vector call is cheaper, return its cost.
LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys);
}
return {IntrinsicCost, LibCost};
}
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
@@ -3539,9 +3573,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
SmallVector<Value *, 4> Args(CI->arg_operands());
int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
VecTy->getNumElements());
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second);
LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -4446,13 +4479,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (Function *FI = CI->getCalledFunction())
IID = FI->getIntrinsicID();
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
bool UseIntrinsic = VecCallCosts.first <= VecCallCosts.second;
Value *ScalarArg = nullptr;
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
if (hasVectorInstrinsicScalarOpd(IID, j)) {
if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
@@ -4465,9 +4503,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Module *M = F->getParent();
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
if (!UseIntrinsic) {
VFShape Shape = VFShape::get(
*CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
false /*HasGlobalPred*/);
CF = VFDatabase(*CI).getVectorizedFunction(Shape);
}
SmallVector<OperandBundleDef, 1> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
@@ -5561,6 +5606,7 @@ struct SLPVectorizer : public FunctionPass {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addRequired<InjectTLIMappingsLegacy>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
@@ -7476,6 +7522,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }


@@ -250,6 +250,7 @@
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: SLP Vectorizer
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Combine redundant instructions


@@ -255,6 +255,7 @@
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: SLP Vectorizer
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Combine redundant instructions


@@ -237,6 +237,7 @@
; CHECK-NEXT: Lazy Branch Probability Analysis
; CHECK-NEXT: Lazy Block Frequency Analysis
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Inject TLI Mappings
; CHECK-NEXT: SLP Vectorizer
; CHECK-NEXT: Optimization Remark Emitter
; CHECK-NEXT: Combine redundant instructions


@@ -8,23 +8,20 @@ target triple = "arm64-apple-ios14.0.0"
declare float @llvm.sin.f32(float) #1
; FIXME: Accelerate provides sin() for <4 x float>
; Accelerate provides sin() for <4 x float>
define <4 x float> @sin_4x(<4 x float>* %a) {
; CHECK-LABEL: @sin_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
;
; NOACCELERATE-LABEL: @sin_4x(
@@ -67,10 +64,10 @@ define <2 x float> @sin_2x(<2 x float>* %a) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #1
; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #1
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
;
@@ -99,23 +96,20 @@ entry:
declare float @llvm.cos.f32(float) #1
; FIXME: Accelerate provides cos() for <4 x float>
; Accelerate provides cos() for <4 x float>
define <4 x float> @cos_4x(<4 x float>* %a) {
; CHECK-LABEL: @cos_4x(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
; CHECK-NEXT: [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
; CHECK-NEXT: [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
; CHECK-NEXT: [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
; CHECK-NEXT: [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
; CHECK-NEXT: [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
; CHECK-NEXT: [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
; CHECK-NEXT: [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
; CHECK-NEXT: ret <4 x float> [[VECINS_3]]
;
; NOACCELERATE-LABEL: @cos_4x(
@@ -158,10 +152,10 @@ define <2 x float> @cos_2x(<2 x float>* %a) {
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
; CHECK-NEXT: [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
; CHECK-NEXT: [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #2
; CHECK-NEXT: [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
; CHECK-NEXT: [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
; CHECK-NEXT: [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #2
; CHECK-NEXT: [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
; CHECK-NEXT: ret <2 x float> [[VECINS_1]]
;