mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 04:32:44 +01:00
[SVE][LoopVectorize] Add support for extracting the last lane of a scalable vector
Some loops, such as: for (int i = 0; i < n; i++) { a[i] = b[i] + 1; *inv = a[i]; } can only be vectorised if we are able to extract the last lane of the vectorised form of 'a[i]'. For fixed-width vectors this already works, since the final lane is known at compile time; for scalable vectors, however, the final lane is only known at runtime. This patch adds support for extracting the last lane from a scalable vector using a runtime-determined lane value. I have added support to VPIteration for runtime-determined lanes that still permit the caching of values. I did this by introducing a new class called VPLane, which describes the lane we're dealing with and provides interfaces to get both the compile-time known lane and the runtime-determined value. Whilst doing this work I couldn't find any explicit tests for extracting the last-lane values of fixed-width vectors, so I added tests for both scalable and fixed-width vectors. Differential Revision: https://reviews.llvm.org/D95139
This commit is contained in:
parent
6d55f9cfe4
commit
dcd9c105f6
@ -1109,6 +1109,12 @@ static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
|
||||
|
||||
namespace llvm {
|
||||
|
||||
/// Return the runtime value for VF.
/// The known minimum element count of \p VF is first materialised as a
/// constant of type \p Ty. For a scalable \p VF that constant is handed to
/// IRBuilder::CreateVScale on \p B and the resulting value is returned; for a
/// fixed-width \p VF the constant itself is returned.
|
||||
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
|
||||
Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
|
||||
// Only the scalable case goes through the builder; the fixed-width case
// returns the constant as-is.
return VF.isScalable() ? B.CreateVScale(EC) : EC;
|
||||
}
|
||||
|
||||
void reportVectorizationFailure(const StringRef DebugMsg,
|
||||
const StringRef OREMsg, const StringRef ORETag,
|
||||
OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
|
||||
@ -2555,7 +2561,8 @@ void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
|
||||
Value *ScalarInst = State.get(Def, Instance);
|
||||
Value *VectorValue = State.get(Def, Instance.Part);
|
||||
VectorValue = Builder.CreateInsertElement(
|
||||
VectorValue, ScalarInst, State.Builder.getInt32(Instance.Lane));
|
||||
VectorValue, ScalarInst,
|
||||
Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
|
||||
State.set(Def, VectorValue, Instance.Part);
|
||||
}
|
||||
|
||||
@ -2967,7 +2974,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
|
||||
auto InputInstance = Instance;
|
||||
if (!Operand || !OrigLoop->contains(Operand) ||
|
||||
(Cost->isUniformAfterVectorization(Operand, State.VF)))
|
||||
InputInstance.Lane = 0;
|
||||
InputInstance.Lane = VPLane::getFirstLane();
|
||||
auto *NewOp = State.get(User.getOperand(op), InputInstance);
|
||||
Cloned->setOperand(op, NewOp);
|
||||
}
|
||||
@ -4439,14 +4446,13 @@ void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
|
||||
|
||||
auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
|
||||
// Non-instruction incoming values will have only one value.
|
||||
unsigned LastLane = 0;
|
||||
if (isa<Instruction>(IncomingValue))
|
||||
LastLane = Cost->isUniformAfterVectorization(
|
||||
cast<Instruction>(IncomingValue), VF)
|
||||
? 0
|
||||
: VF.getKnownMinValue() - 1;
|
||||
assert((!VF.isScalable() || LastLane == 0) &&
|
||||
"scalable vectors dont support non-uniform scalars yet");
|
||||
|
||||
VPLane Lane = VPLane::getFirstLane();
|
||||
if (isa<Instruction>(IncomingValue) &&
|
||||
!Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
|
||||
VF))
|
||||
Lane = VPLane::getLastLaneForVF(VF);
|
||||
|
||||
// Can be a loop invariant incoming value or the last scalar value to be
|
||||
// extracted from the vectorized loop.
|
||||
Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
|
||||
@ -4454,7 +4460,7 @@ void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
|
||||
OrigLoop->isLoopInvariant(IncomingValue)
|
||||
? IncomingValue
|
||||
: State.get(State.Plan->getVPValue(IncomingValue),
|
||||
VPIteration(UF - 1, LastLane));
|
||||
VPIteration(UF - 1, Lane));
|
||||
LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
|
||||
}
|
||||
}
|
||||
@ -9132,7 +9138,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
|
||||
// Insert scalar instance packing it into a vector.
|
||||
if (AlsoPack && State.VF.isVector()) {
|
||||
// If we're constructing lane 0, initialize to start from poison.
|
||||
if (State.Instance->Lane == 0) {
|
||||
if (State.Instance->Lane.isFirstLane()) {
|
||||
assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
|
||||
Value *Poison = PoisonValue::get(
|
||||
VectorType::get(getUnderlyingValue()->getType(), State.VF));
|
||||
@ -9160,7 +9166,7 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
|
||||
assert(State.Instance && "Branch on Mask works only on single instance.");
|
||||
|
||||
unsigned Part = State.Instance->Part;
|
||||
unsigned Lane = State.Instance->Lane;
|
||||
unsigned Lane = State.Instance->Lane.getKnownLane();
|
||||
|
||||
Value *ConditionBit = nullptr;
|
||||
VPValue *BlockInMask = getMask();
|
||||
|
@ -58,6 +58,19 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
|
||||
return OS;
|
||||
}
|
||||
|
||||
/// Builds the i32 lane index for this lane using \p Builder.
/// Kind::First lanes yield the constant Lane. Kind::ScalableLast lanes yield
/// getRuntimeVF(...) for \p VF minus the constant
/// (VF.getKnownMinValue() - Lane).
Value *VPLane::getAsRuntimeExpr(IRBuilder<> &Builder,
|
||||
const ElementCount &VF) const {
|
||||
switch (LaneKind) {
|
||||
case VPLane::Kind::ScalableLast:
|
||||
// Runtime lane index = RuntimeVF - VF.getKnownMinValue() + Lane, emitted
// as a single subtraction: RuntimeVF - (VF.getKnownMinValue() - Lane).
|
||||
return Builder.CreateSub(getRuntimeVF(Builder, Builder.getInt32Ty(), VF),
|
||||
Builder.getInt32(VF.getKnownMinValue() - Lane));
|
||||
// For First, the stored Lane is used directly as an i32 constant.
case VPLane::Kind::First:
|
||||
return Builder.getInt32(Lane);
|
||||
}
|
||||
llvm_unreachable("Unknown lane kind");
|
||||
}
|
||||
|
||||
VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def)
|
||||
: SubclassID(SC), UnderlyingVal(UV), Def(Def) {
|
||||
if (Def)
|
||||
@ -244,18 +257,20 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
|
||||
if (!Def->getDef())
|
||||
return Def->getLiveInIRValue();
|
||||
|
||||
if (hasScalarValue(Def, Instance))
|
||||
return Data.PerPartScalars[Def][Instance.Part][Instance.Lane];
|
||||
if (hasScalarValue(Def, Instance)) {
|
||||
return Data
|
||||
.PerPartScalars[Def][Instance.Part][Instance.Lane.mapToCacheIndex(VF)];
|
||||
}
|
||||
|
||||
assert(hasVectorValue(Def, Instance.Part));
|
||||
auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
|
||||
if (!VecPart->getType()->isVectorTy()) {
|
||||
assert(Instance.Lane == 0 && "cannot get lane > 0 for scalar");
|
||||
assert(Instance.Lane.isFirstLane() && "cannot get lane > 0 for scalar");
|
||||
return VecPart;
|
||||
}
|
||||
// TODO: Cache created scalar values.
|
||||
auto *Extract =
|
||||
Builder.CreateExtractElement(VecPart, Builder.getInt32(Instance.Lane));
|
||||
Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF);
|
||||
auto *Extract = Builder.CreateExtractElement(VecPart, Lane);
|
||||
// set(Def, Extract, Instance);
|
||||
return Extract;
|
||||
}
|
||||
@ -427,7 +442,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
|
||||
assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
|
||||
for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
|
||||
++Lane) {
|
||||
State->Instance->Lane = Lane;
|
||||
State->Instance->Lane = VPLane(Lane, VPLane::Kind::First);
|
||||
// Visit the VPBlocks connected to \p this, starting from it.
|
||||
for (VPBlockBase *Block : RPOT) {
|
||||
LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
|
||||
|
@ -60,6 +60,11 @@ class VPRegionBlock;
|
||||
class VPlan;
|
||||
class VPlanSlp;
|
||||
|
||||
/// Returns a calculation for the total number of elements for a given \p VF.
|
||||
/// For fixed width vectors this value is a constant, whereas for scalable
|
||||
/// vectors it is an expression determined at runtime.
|
||||
Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF);
|
||||
|
||||
/// A range of powers-of-2 vectorization factors with fixed start and
|
||||
/// adjustable end. The range includes start and excludes end, e.g.,:
|
||||
/// [1, 9) = {1, 2, 4, 8}
|
||||
@ -89,18 +94,98 @@ using VPlanPtr = std::unique_ptr<VPlan>;
|
||||
/// vectorizer whereas the term "output IR" refers to code that is generated by
|
||||
/// the vectorizer.
|
||||
|
||||
/// VPLane provides a way to access lanes in both fixed width and scalable
|
||||
/// vectors, where for the latter the lane index sometimes needs calculating
|
||||
/// as a runtime expression.
|
||||
class VPLane {
|
||||
public:
|
||||
/// Kind describes how to interpret Lane.
|
||||
enum class Kind : uint8_t {
|
||||
/// For First, Lane is the index into the first N elements of a
|
||||
/// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
|
||||
First,
|
||||
/// For ScalableLast, Lane is the offset from the start of the last
|
||||
/// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
|
||||
/// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
|
||||
/// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
|
||||
ScalableLast
|
||||
};
|
||||
|
||||
private:
|
||||
/// in [0..VF)
/// mapToCacheIndex() asserts that this is below VF.getKnownMinValue() for
/// both kinds.
|
||||
unsigned Lane;
|
||||
|
||||
/// Indicates how the Lane should be interpreted, as described above.
|
||||
Kind LaneKind;
|
||||
|
||||
public:
|
||||
/// Creates a lane with offset \p Lane, interpreted according to
/// \p LaneKind.
VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}
|
||||
|
||||
/// Returns lane 0 with Kind::First.
static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
|
||||
|
||||
/// Returns the last lane for \p VF. The offset is always
/// VF.getKnownMinValue() - 1; it is tagged Kind::ScalableLast when \p VF is
/// scalable and Kind::First otherwise.
static VPLane getLastLaneForVF(const ElementCount &VF) {
|
||||
unsigned LaneOffset = VF.getKnownMinValue() - 1;
|
||||
Kind LaneKind;
|
||||
if (VF.isScalable())
|
||||
// In this case 'LaneOffset' refers to the offset from the start of the
|
||||
// last subvector with VF.getKnownMinValue() elements.
|
||||
LaneKind = VPLane::Kind::ScalableLast;
|
||||
else
|
||||
LaneKind = VPLane::Kind::First;
|
||||
return VPLane(LaneOffset, LaneKind);
|
||||
}
|
||||
|
||||
/// Returns a compile-time known value for the lane index and asserts if the
|
||||
/// lane can only be calculated at runtime.
|
||||
unsigned getKnownLane() const {
|
||||
assert(LaneKind == Kind::First);
|
||||
return Lane;
|
||||
}
|
||||
|
||||
/// Returns an expression describing the lane index that can be used at
|
||||
/// runtime.
|
||||
Value *getAsRuntimeExpr(IRBuilder<> &Builder, const ElementCount &VF) const;
|
||||
|
||||
/// Returns the Kind of lane offset.
|
||||
Kind getKind() const { return LaneKind; }
|
||||
|
||||
/// Returns true if this is the first lane of the whole vector.
|
||||
bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }
|
||||
|
||||
/// Maps the lane to a cache index based on \p VF.
/// Kind::First lanes map to their own offset. Kind::ScalableLast lanes map
/// to VF.getKnownMinValue() + offset, which is outside the range the
/// First-lane assertion below allows.
|
||||
unsigned mapToCacheIndex(const ElementCount &VF) const {
|
||||
switch (LaneKind) {
|
||||
case VPLane::Kind::ScalableLast:
|
||||
assert(VF.isScalable() && Lane < VF.getKnownMinValue());
|
||||
return VF.getKnownMinValue() + Lane;
|
||||
// Any other kind (currently only Kind::First) uses the offset unchanged.
default:
|
||||
assert(Lane < VF.getKnownMinValue());
|
||||
return Lane;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the maximum number of lanes that we are able to consider
|
||||
/// caching for \p VF.
/// This is VF.getKnownMinValue(), doubled when \p VF is scalable.
|
||||
static unsigned getNumCachedLanes(const ElementCount &VF) {
|
||||
return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1);
|
||||
}
|
||||
};
|
||||
|
||||
/// VPIteration represents a single point in the iteration space of the output
|
||||
/// (vectorized and/or unrolled) IR loop.
|
||||
struct VPIteration {
|
||||
/// in [0..UF)
|
||||
unsigned Part;
|
||||
|
||||
/// in [0..VF)
|
||||
unsigned Lane;
|
||||
VPLane Lane;
|
||||
|
||||
VPIteration(unsigned Part, unsigned Lane) : Part(Part), Lane(Lane) {}
|
||||
VPIteration(unsigned Part, unsigned Lane,
|
||||
VPLane::Kind Kind = VPLane::Kind::First)
|
||||
: Part(Part), Lane(Lane, Kind) {}
|
||||
|
||||
bool isFirstIteration() const { return Part == 0 && Lane == 0; }
|
||||
VPIteration(unsigned Part, const VPLane &Lane) : Part(Part), Lane(Lane) {}
|
||||
|
||||
bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); }
|
||||
};
|
||||
|
||||
/// VPTransformState holds information passed down when "executing" a VPlan,
|
||||
@ -157,9 +242,10 @@ struct VPTransformState {
|
||||
auto I = Data.PerPartScalars.find(Def);
|
||||
if (I == Data.PerPartScalars.end())
|
||||
return false;
|
||||
unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
|
||||
return Instance.Part < I->second.size() &&
|
||||
Instance.Lane < I->second[Instance.Part].size() &&
|
||||
I->second[Instance.Part][Instance.Lane];
|
||||
CacheIdx < I->second[Instance.Part].size() &&
|
||||
I->second[Instance.Part][CacheIdx];
|
||||
}
|
||||
|
||||
/// Set the generated Value for a given VPValue and a given Part.
|
||||
@ -185,10 +271,11 @@ struct VPTransformState {
|
||||
while (PerPartVec.size() <= Instance.Part)
|
||||
PerPartVec.emplace_back();
|
||||
auto &Scalars = PerPartVec[Instance.Part];
|
||||
while (Scalars.size() <= Instance.Lane)
|
||||
unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
|
||||
while (Scalars.size() <= CacheIdx)
|
||||
Scalars.push_back(nullptr);
|
||||
assert(!Scalars[Instance.Lane] && "should overwrite existing value");
|
||||
Scalars[Instance.Lane] = V;
|
||||
assert(!Scalars[CacheIdx] && "should overwrite existing value");
|
||||
Scalars[CacheIdx] = V;
|
||||
}
|
||||
|
||||
/// Reset an existing scalar value for \p Def and a given \p Instance.
|
||||
@ -198,9 +285,10 @@ struct VPTransformState {
|
||||
"need to overwrite existing value");
|
||||
assert(Instance.Part < Iter->second.size() &&
|
||||
"need to overwrite existing value");
|
||||
assert(Instance.Lane < Iter->second[Instance.Part].size() &&
|
||||
unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
|
||||
assert(CacheIdx < Iter->second[Instance.Part].size() &&
|
||||
"need to overwrite existing value");
|
||||
Iter->second[Instance.Part][Instance.Lane] = V;
|
||||
Iter->second[Instance.Part][CacheIdx] = V;
|
||||
}
|
||||
|
||||
/// Hold state information used when constructing the CFG of the output IR,
|
||||
|
@ -0,0 +1,77 @@
|
||||
; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
|
||||
|
||||
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
|
||||
|
||||
; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
|
||||
; WARN-NOT: warning
|
||||
|
||||
target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
|
||||
target triple = "aarch64-unknown-linux-gnu"
|
||||
|
||||
define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 {
|
||||
; CHECK-LABEL: @inv_store_last_lane
|
||||
; CHECK: vector.body:
|
||||
; CHECK: store <vscale x 4 x i32> %[[VEC_VAL:.*]], <
|
||||
; CHECK: middle.block:
|
||||
; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
|
||||
; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
|
||||
; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
|
||||
; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x i32> %[[VEC_VAL]], i32 %[[LAST_LANE]]
|
||||
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %entry, %for.body
|
||||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
|
||||
%arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
|
||||
%0 = load i32, i32* %arrayidx, align 4
|
||||
%mul = shl nsw i32 %0, 1
|
||||
%arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
|
||||
store i32 %mul, i32* %arrayidx2, align 4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
|
||||
br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0
|
||||
|
||||
exit: ; preds = %for.body
|
||||
%arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42
|
||||
store i32 %mul, i32* %arrayidx5, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 {
|
||||
; CHECK-LABEL: @ret_last_lane
|
||||
; CHECK: vector.body:
|
||||
; CHECK: store <vscale x 4 x float> %[[VEC_VAL:.*]], <
|
||||
; CHECK: middle.block:
|
||||
; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
|
||||
; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
|
||||
; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
|
||||
; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x float> %[[VEC_VAL]], i32 %[[LAST_LANE]]
|
||||
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %entry, %for.body
|
||||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
|
||||
%arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
|
||||
%0 = load float, float* %arrayidx, align 4
|
||||
%mul = fmul float %0, 2.000000e+00
|
||||
%arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
|
||||
store float %mul, float* %arrayidx2, align 4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
|
||||
br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
|
||||
|
||||
exit: ; preds = %for.body
|
||||
ret float %mul
|
||||
}
|
||||
|
||||
attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
|
||||
|
||||
!0 = distinct !{!0, !1, !2, !3, !4, !5}
|
||||
!1 = !{!"llvm.loop.mustprogress"}
|
||||
!2 = !{!"llvm.loop.vectorize.width", i32 4}
|
||||
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
|
||||
!4 = !{!"llvm.loop.interleave.count", i32 1}
|
||||
!5 = !{!"llvm.loop.vectorize.enable", i1 true}
|
||||
!6 = distinct !{!6, !1, !2, !3, !4, !5}
|
53
test/Transforms/LoopVectorize/extract-last-veclane.ll
Normal file
53
test/Transforms/LoopVectorize/extract-last-veclane.ll
Normal file
@ -0,0 +1,53 @@
|
||||
; RUN: opt -loop-vectorize -dce -instcombine -S -force-vector-width=4 < %s 2>%t | FileCheck %s
|
||||
|
||||
define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) {
|
||||
; CHECK-LABEL: @inv_store_last_lane
|
||||
; CHECK: vector.body:
|
||||
; CHECK: store <4 x i32> %[[VEC_VAL:.*]], <
|
||||
; CHECK: middle.block:
|
||||
; CHECK: %{{.*}} = extractelement <4 x i32> %[[VEC_VAL]], i32 3
|
||||
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %entry, %for.body
|
||||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
|
||||
%arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
|
||||
%0 = load i32, i32* %arrayidx, align 4
|
||||
%mul = shl nsw i32 %0, 1
|
||||
%arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
|
||||
store i32 %mul, i32* %arrayidx2, align 4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
|
||||
br i1 %exitcond.not, label %exit, label %for.body
|
||||
|
||||
exit: ; preds = %for.body
|
||||
%arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42
|
||||
store i32 %mul, i32* %arrayidx5, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
|
||||
; CHECK-LABEL: @ret_last_lane
|
||||
; CHECK: vector.body:
|
||||
; CHECK: store <4 x float> %[[VEC_VAL:.*]], <
|
||||
; CHECK: middle.block:
|
||||
; CHECK: %{{.*}} = extractelement <4 x float> %[[VEC_VAL]], i32 3
|
||||
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
for.body: ; preds = %entry, %for.body
|
||||
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
|
||||
%arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
|
||||
%0 = load float, float* %arrayidx, align 4
|
||||
%mul = fmul float %0, 2.000000e+00
|
||||
%arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
|
||||
store float %mul, float* %arrayidx2, align 4
|
||||
%indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
|
||||
%exitcond.not = icmp eq i64 %indvars.iv.next, %n
|
||||
br i1 %exitcond.not, label %exit, label %for.body
|
||||
|
||||
exit: ; preds = %for.body
|
||||
ret float %mul
|
||||
}
|
Loading…
Reference in New Issue
Block a user