[SLP] Support vectorizing functions provided by vector libs.
It seems like the SLPVectorizer is currently not aware of vector versions of
functions provided by libraries like Accelerate [1].

This patch updates SLPVectorizer to use the same infrastructure the
LoopVectorizer uses to detect vectorizable library functions.

For calls, it computes the cost of an intrinsic call (existing behavior) and
the cost of a vector function library call, if available. Like LoopVectorizer,
it assumes the cost of the vector function is simply the cost of a call to a
vector function.

[1] https://developer.apple.com/documentation/accelerate

Reviewers: ABataev, RKSimon, spatel

Reviewed By: ABataev

Differential Revision: https://reviews.llvm.org/D75878
commit ac56a0239b
parent 13106dbb0f
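The core of the change is a two-way cost query: for each vectorizable call, compute the cost of lowering via the vector intrinsic and the cost of calling a vector library function, then keep whichever is cheaper. A minimal standalone restatement of that rule, with our own helper names (only the pair layout and the <= tie-break come from the patch itself):

    #include <algorithm>
    #include <utility>

    // first  = cost of lowering via the vector intrinsic,
    // second = cost of calling the vector library function.
    static bool preferIntrinsic(std::pair<unsigned, unsigned> VecCallCosts) {
      // Ties favor the intrinsic, matching "UseIntrinsic = first <= second"
      // in vectorizeTree below.
      return VecCallCosts.first <= VecCallCosts.second;
    }

    static unsigned vectorCallCost(std::pair<unsigned, unsigned> VecCallCosts) {
      // getEntryCost charges the cheaper of the two lowerings.
      return std::min(VecCallCosts.first, VecCallCosts.second);
    }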
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -85,6 +85,7 @@
 #include "llvm/Support/KnownBits.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
@@ -3227,6 +3228,39 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
   });
 }
 
+std::pair<unsigned, unsigned> getVectorCallCosts(CallInst *CI,
+                                                 VectorType *VecTy,
+                                                 TargetTransformInfo *TTI,
+                                                 TargetLibraryInfo *TLI) {
+  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+  // Calculate the cost of the scalar and vector calls.
+  FastMathFlags FMF;
+  if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
+    FMF = FPMO->getFastMathFlags();
+
+  SmallVector<Value *, 4> Args(CI->arg_operands());
+  int IntrinsicCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
+                                                 VecTy->getNumElements());
+
+  auto Shape =
+      VFShape::get(*CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
+                   false /*HasGlobalPred*/);
+  Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+  int LibCost = IntrinsicCost;
+  if (!CI->isNoBuiltin() && VecFunc) {
+    // Calculate the cost of the vector library call.
+    SmallVector<Type *, 4> VecTys;
+    for (Use &Arg : CI->args())
+      VecTys.push_back(
+          VectorType::get(Arg->getType(), VecTy->getNumElements()));
+
+    // If the corresponding vector call is cheaper, return its cost.
+    LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys);
+  }
+  return {IntrinsicCost, LibCost};
+}
+
 int BoUpSLP::getEntryCost(TreeEntry *E) {
   ArrayRef<Value*> VL = E->Scalars;
 
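The helper above delegates the availability check to the VFShape/VFDatabase machinery the LoopVectorizer already uses. A fragment sketching that query in isolation, assuming a scalar CallInst *CI and a fixed width of 4 (the surrounding setup is ours, not from the patch):

    // CI calls a scalar routine such as sinf; InjectTLIMappings has already
    // run, so any TLI-provided vector variants are visible to VFDatabase.
    VFShape Shape = VFShape::get(*CI, /*EC=*/{4, /*Scalable=*/false},
                                 /*HasGlobalPred=*/false);
    if (Function *VecVariant = VFDatabase(*CI).getVectorizedFunction(Shape)) {
      // A 4-wide variant exists (e.g. sinf -> vsinf with
      // -vector-library=Accelerate); the patch costs it as a plain call.
    }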
@@ -3539,9 +3573,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       }
       int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
 
-      SmallVector<Value *, 4> Args(CI->arg_operands());
-      int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
-                                                   VecTy->getNumElements());
+      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+      int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second);
 
       LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
                         << " (" << VecCallCost << "-" << ScalarCallCost << ")"
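To make the comparison concrete, a worked example with invented numbers (none of these costs come from a real cost model):

    // Hypothetical 4-wide sinf: each scalar call costs 10, the intrinsic
    // lowering costs 46, and a single @vsinf library call costs 18.
    int ScalarCallCost = 4 * 10;                 // 40
    std::pair<int, int> VecCallCosts = {46, 18}; // {intrinsic, library}
    int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second); // 18
    // Printed as "SLP: Call cost -22 (18-40)"; the negative delta means
    // vectorizing via the library call is profitable.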
@@ -4446,13 +4479,18 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       if (Function *FI = CI->getCalledFunction())
         IID = FI->getIntrinsicID();
 
+      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+      auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+      bool UseIntrinsic = VecCallCosts.first <= VecCallCosts.second;
+
       Value *ScalarArg = nullptr;
       std::vector<Value *> OpVecs;
       for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
         ValueList OpVL;
         // Some intrinsics have scalar arguments. This argument should not be
         // vectorized.
-        if (hasVectorInstrinsicScalarOpd(IID, j)) {
+        if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
           CallInst *CEI = cast<CallInst>(VL0);
           ScalarArg = CEI->getArgOperand(j);
           OpVecs.push_back(CEI->getArgOperand(j));
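The new UseIntrinsic guard exists because the scalar-operand special case is an intrinsic-only property. For example, llvm.powi keeps its exponent operand scalar when widened, whereas a vector library routine takes every operand in vector form; a sketch of the distinction:

    // hasVectorInstrinsicScalarOpd(Intrinsic::powi, 1) is true: operand 1
    // (the exponent) must stay scalar under the intrinsic lowering. When the
    // library call wins the cost comparison instead, no operand is exempt,
    // so the check is now gated on UseIntrinsic.
    bool KeepOperandScalar = UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j);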
@@ -4465,9 +4503,16 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       }
 
       Module *M = F->getParent();
-      Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
       Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
       Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
+
+      if (!UseIntrinsic) {
+        VFShape Shape = VFShape::get(
+            *CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
+            false /*HasGlobalPred*/);
+        CF = VFDatabase(*CI).getVectorizedFunction(Shape);
+      }
+
       SmallVector<OperandBundleDef, 1> OpBundles;
       CI->getOperandBundlesAsDefs(OpBundles);
       Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
@@ -5561,6 +5606,7 @@ struct SLPVectorizer : public FunctionPass {
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addRequired<DemandedBitsWrapperPass>();
     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+    AU.addRequired<InjectTLIMappingsLegacy>();
     AU.addPreserved<LoopInfoWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<AAResultsWrapperPass>();
@@ -7476,6 +7522,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
 INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
 INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
 
 Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
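The new pass dependency is what makes the VFDatabase queries above return anything: InjectTLIMappings runs first and records each TLI-provided variant as a call-site attribute that VFDatabase later parses. Roughly (the attribute name is the in-tree one; the exact mangled string is an illustrative example of VFABI mangling, not taken from this patch):

    // Sketch of what InjectTLIMappings attaches to a scalar call to sinf
    // when a 4-wide variant is available:
    CI->addAttribute(AttributeList::FunctionIndex,
                     Attribute::get(Ctx, "vector-function-abi-variant",
                                    "_ZGV_LLVM_N4v_sinf(vsinf)"));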
llvm/test/Other/opt-O2-pipeline.ll
@@ -250,6 +250,7 @@
 ; CHECK-NEXT:       Lazy Branch Probability Analysis
 ; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       SLP Vectorizer
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Combine redundant instructions
llvm/test/Other/opt-O3-pipeline.ll
@@ -255,6 +255,7 @@
 ; CHECK-NEXT:       Lazy Branch Probability Analysis
 ; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       SLP Vectorizer
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Combine redundant instructions
llvm/test/Other/opt-Os-pipeline.ll
@@ -237,6 +237,7 @@
 ; CHECK-NEXT:       Lazy Branch Probability Analysis
 ; CHECK-NEXT:       Lazy Block Frequency Analysis
 ; CHECK-NEXT:       Optimization Remark Emitter
+; CHECK-NEXT:       Inject TLI Mappings
 ; CHECK-NEXT:       SLP Vectorizer
 ; CHECK-NEXT:       Optimization Remark Emitter
 ; CHECK-NEXT:       Combine redundant instructions
llvm/test/Transforms/SLPVectorizer/AArch64/accelerate-vector-functions.ll
@@ -8,23 +8,20 @@ target triple = "arm64-apple-ios14.0.0"
 declare float @llvm.sin.f32(float) #1
 
 
-; FIXME: Accelerate provides sin() for <4 x float>
+; Accelerate provides sin() for <4 x float>
 define <4 x float> @sin_4x(<4 x float>* %a) {
 ; CHECK-LABEL: @sin_4x(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
-; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_2]])
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_3]])
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vsinf(<4 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 ; NOACCELERATE-LABEL: @sin_4x(
@@ -67,10 +64,10 @@ define <2 x float> @sin_2x(<2 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]])
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT]]) #1
 ; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.sin.f32(float [[VECEXT_1]]) #1
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
 ; CHECK-NEXT:    ret <2 x float> [[VECINS_1]]
 ;
@@ -99,23 +96,20 @@ entry:
 
 declare float @llvm.cos.f32(float) #1
 
-; FIXME: Accelerate provides cos() for <4 x float>
+; Accelerate provides cos() for <4 x float>
 define <4 x float> @cos_4x(<4 x float>* %a) {
 ; CHECK-LABEL: @cos_4x(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[A:%.*]], align 16
-; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP1]], i32 0
-; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
-; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_2]])
-; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
-; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_3]])
-; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast <4 x float> @vcosf(<4 x float> [[TMP0]])
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <4 x float> undef, float [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP3]], i32 1
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[TMP1]], i32 3
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP5]], i32 3
 ; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 ; NOACCELERATE-LABEL: @cos_4x(
@@ -158,10 +152,10 @@ define <2 x float> @cos_2x(<2 x float>* %a) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* [[A:%.*]], align 16
 ; CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <2 x float> [[TMP0]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]])
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT]]) #2
 ; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <2 x float> [[TMP0]], i32 1
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.cos.f32(float [[VECEXT_1]]) #2
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <2 x float> [[VECINS]], float [[TMP2]], i32 1
 ; CHECK-NEXT:    ret <2 x float> [[VECINS_1]]
 ;
|
Loading…
x
Reference in New Issue
Block a user