From dcd9c105f68fd964f6bd3a5a73adff0413edeee0 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood@arm.com>
Date: Fri, 15 Jan 2021 14:12:50 +0000
Subject: [PATCH] [SVE][LoopVectorize] Add support for extracting the last lane of a scalable vector

There are certain loops like the one below:

  for (int i = 0; i < n; i++) {
    a[i] = b[i] + 1;
    *inv = a[i];
  }

that can only be vectorised if we are able to extract the last lane of
the vectorised form of 'a[i]'. For fixed width vectors this already
works, since we know at compile time what the final lane is; for
scalable vectors, however, this is a different story. This patch adds
support for extracting the last lane from a scalable vector using a
runtime-determined lane value. I have added support to VPIteration for
runtime-determined lanes that still permit the caching of values. I did
this by introducing a new class called VPLane, which describes the lane
we're dealing with and provides interfaces to get both the compile-time
known lane and the runtime-determined value.

Whilst doing this work I couldn't find any explicit tests for
extracting the last lane values of fixed width vectors, so I added
tests for both scalable and fixed width vectors.

Differential Revision: https://reviews.llvm.org/D95139
---
 lib/Transforms/Vectorize/LoopVectorize.cpp    |  32 ++---
 lib/Transforms/Vectorize/VPlan.cpp            |  27 ++++-
 lib/Transforms/Vectorize/VPlan.h              | 110 ++++++++++++++++--
 .../AArch64/sve-extract-last-veclane.ll       |  77 ++++++++++++
 .../LoopVectorize/extract-last-veclane.ll     |  53 +++++++++
 5 files changed, 269 insertions(+), 30 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
 create mode 100644 test/Transforms/LoopVectorize/extract-last-veclane.ll

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index b18aa6c4865..ba93dd2812e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1109,6 +1109,12 @@ static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
 
 namespace llvm {
 
+/// Return the runtime value for VF.
+Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF) {
+  Constant *EC = ConstantInt::get(Ty, VF.getKnownMinValue());
+  return VF.isScalable() ? B.CreateVScale(EC) : EC;
+}
+
 void reportVectorizationFailure(const StringRef DebugMsg,
     const StringRef OREMsg, const StringRef ORETag,
     OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
@@ -2555,7 +2561,8 @@ void InnerLoopVectorizer::packScalarIntoVectorValue(VPValue *Def,
   Value *ScalarInst = State.get(Def, Instance);
   Value *VectorValue = State.get(Def, Instance.Part);
   VectorValue = Builder.CreateInsertElement(
-      VectorValue, ScalarInst, State.Builder.getInt32(Instance.Lane));
+      VectorValue, ScalarInst,
+      Instance.Lane.getAsRuntimeExpr(State.Builder, VF));
   State.set(Def, VectorValue, Instance.Part);
 }
 
@@ -2967,7 +2974,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPValue *Def,
     auto InputInstance = Instance;
     if (!Operand || !OrigLoop->contains(Operand) ||
         (Cost->isUniformAfterVectorization(Operand, State.VF)))
-      InputInstance.Lane = 0;
+      InputInstance.Lane = VPLane::getFirstLane();
     auto *NewOp = State.get(User.getOperand(op), InputInstance);
     Cloned->setOperand(op, NewOp);
   }
@@ -4439,14 +4446,13 @@ void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
     auto *IncomingValue = LCSSAPhi.getIncomingValue(0);
     // Non-instruction incoming values will have only one value.
-    unsigned LastLane = 0;
-    if (isa<Instruction>(IncomingValue))
-      LastLane = Cost->isUniformAfterVectorization(
-                     cast<Instruction>(IncomingValue), VF)
-                     ? 0
-                     : VF.getKnownMinValue() - 1;
-    assert((!VF.isScalable() || LastLane == 0) &&
-           "scalable vectors dont support non-uniform scalars yet");
+
+    VPLane Lane = VPLane::getFirstLane();
+    if (isa<Instruction>(IncomingValue) &&
+        !Cost->isUniformAfterVectorization(cast<Instruction>(IncomingValue),
+                                           VF))
+      Lane = VPLane::getLastLaneForVF(VF);
+
     // Can be a loop invariant incoming value or the last scalar value to be
     // extracted from the vectorized loop.
     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
@@ -4454,7 +4460,7 @@ void InnerLoopVectorizer::fixLCSSAPHIs(VPTransformState &State) {
     Value *lastIncomingValue =
         OrigLoop->isLoopInvariant(IncomingValue)
             ? IncomingValue
            : State.get(State.Plan->getVPValue(IncomingValue),
-                        VPIteration(UF - 1, LastLane));
+                        VPIteration(UF - 1, Lane));
     LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
   }
 }
@@ -9132,7 +9138,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
   // Insert scalar instance packing it into a vector.
   if (AlsoPack && State.VF.isVector()) {
     // If we're constructing lane 0, initialize to start from poison.
-    if (State.Instance->Lane == 0) {
+    if (State.Instance->Lane.isFirstLane()) {
       assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
       Value *Poison = PoisonValue::get(
           VectorType::get(getUnderlyingValue()->getType(), State.VF));
@@ -9160,7 +9166,7 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
   assert(State.Instance && "Branch on Mask works only on single instance.");
 
   unsigned Part = State.Instance->Part;
-  unsigned Lane = State.Instance->Lane;
+  unsigned Lane = State.Instance->Lane.getKnownLane();
 
   Value *ConditionBit = nullptr;
   VPValue *BlockInMask = getMask();
diff --git a/lib/Transforms/Vectorize/VPlan.cpp b/lib/Transforms/Vectorize/VPlan.cpp
index ea4f0ac2e80..6974502bad7 100644
--- a/lib/Transforms/Vectorize/VPlan.cpp
+++ b/lib/Transforms/Vectorize/VPlan.cpp
@@ -58,6 +58,19 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
   return OS;
 }
 
+Value *VPLane::getAsRuntimeExpr(IRBuilder<> &Builder,
+                                const ElementCount &VF) const {
+  switch (LaneKind) {
+  case VPLane::Kind::ScalableLast:
+    // Lane = RuntimeVF - VF.getKnownMinValue() + Lane
+    return Builder.CreateSub(getRuntimeVF(Builder, Builder.getInt32Ty(), VF),
+                             Builder.getInt32(VF.getKnownMinValue() - Lane));
+  case VPLane::Kind::First:
+    return Builder.getInt32(Lane);
+  }
+  llvm_unreachable("Unknown lane kind");
+}
+
 VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def)
     : SubclassID(SC), UnderlyingVal(UV), Def(Def) {
   if (Def)
@@ -244,18 +257,20 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
   if (!Def->getDef())
     return Def->getLiveInIRValue();
 
-  if (hasScalarValue(Def, Instance))
-    return Data.PerPartScalars[Def][Instance.Part][Instance.Lane];
+  if (hasScalarValue(Def, Instance)) {
+    return Data
+        .PerPartScalars[Def][Instance.Part][Instance.Lane.mapToCacheIndex(VF)];
+  }
 
   assert(hasVectorValue(Def, Instance.Part));
   auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
   if (!VecPart->getType()->isVectorTy()) {
-    assert(Instance.Lane == 0 && "cannot get lane > 0 for scalar");
+    assert(Instance.Lane.isFirstLane() && "cannot get lane > 0 for scalar");
     return VecPart;
   }
   // TODO: Cache created scalar values.
-  auto *Extract =
-      Builder.CreateExtractElement(VecPart, Builder.getInt32(Instance.Lane));
+  Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF);
+  auto *Extract = Builder.CreateExtractElement(VecPart, Lane);
   // set(Def, Extract, Instance);
   return Extract;
 }
@@ -427,7 +442,7 @@ void VPRegionBlock::execute(VPTransformState *State) {
     assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
     for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
          ++Lane) {
-      State->Instance->Lane = Lane;
+      State->Instance->Lane = VPLane(Lane, VPLane::Kind::First);
       // Visit the VPBlocks connected to \p this, starting from it.
       for (VPBlockBase *Block : RPOT) {
         LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
diff --git a/lib/Transforms/Vectorize/VPlan.h b/lib/Transforms/Vectorize/VPlan.h
index 0a5abef0e1f..5a23c683998 100644
--- a/lib/Transforms/Vectorize/VPlan.h
+++ b/lib/Transforms/Vectorize/VPlan.h
@@ -60,6 +60,11 @@ class VPRegionBlock;
 class VPlan;
 class VPlanSlp;
 
+/// Returns a calculation for the total number of elements for a given \p VF.
+/// For fixed width vectors this value is a constant, whereas for scalable
+/// vectors it is an expression determined at runtime.
+Value *getRuntimeVF(IRBuilder<> &B, Type *Ty, ElementCount VF);
+
 /// A range of powers-of-2 vectorization factors with fixed start and
 /// adjustable end. The range includes start and excludes end, e.g.,:
 /// [1, 9) = {1, 2, 4, 8}
@@ -89,18 +94,98 @@ using VPlanPtr = std::unique_ptr<VPlan>;
 /// vectorizer whereas the term "output IR" refers to code that is generated by
 /// the vectorizer.
 
+/// VPLane provides a way to access lanes in both fixed width and scalable
+/// vectors, where for the latter the lane index sometimes needs calculating
+/// as a runtime expression.
+class VPLane {
+public:
+  /// Kind describes how to interpret Lane.
+  enum class Kind : uint8_t {
+    /// For First, Lane is the index into the first N elements of a
+    /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>.
+    First,
+    /// For ScalableLast, Lane is the offset from the start of the last
+    /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For
+    /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of
+    /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
+    ScalableLast
+  };
+
+private:
+  /// in [0..VF)
+  unsigned Lane;
+
+  /// Indicates how the Lane should be interpreted, as described above.
+  Kind LaneKind;
+
+public:
+  VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {}
+
+  static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); }
+
+  static VPLane getLastLaneForVF(const ElementCount &VF) {
+    unsigned LaneOffset = VF.getKnownMinValue() - 1;
+    Kind LaneKind;
+    if (VF.isScalable())
+      // In this case 'LaneOffset' refers to the offset from the start of the
+      // last subvector with VF.getKnownMinValue() elements.
+      LaneKind = VPLane::Kind::ScalableLast;
+    else
+      LaneKind = VPLane::Kind::First;
+    return VPLane(LaneOffset, LaneKind);
+  }
+
+  /// Returns a compile-time known value for the lane index and asserts if the
+  /// lane can only be calculated at runtime.
+  unsigned getKnownLane() const {
+    assert(LaneKind == Kind::First);
+    return Lane;
+  }
+
+  /// Returns an expression describing the lane index that can be used at
+  /// runtime.
+  Value *getAsRuntimeExpr(IRBuilder<> &Builder, const ElementCount &VF) const;
+
+  /// Returns the Kind of lane offset.
+  Kind getKind() const { return LaneKind; }
+
+  /// Returns true if this is the first lane of the whole vector.
+  bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; }
+
+  /// Maps the lane to a cache index based on \p VF.
+  unsigned mapToCacheIndex(const ElementCount &VF) const {
+    switch (LaneKind) {
+    case VPLane::Kind::ScalableLast:
+      assert(VF.isScalable() && Lane < VF.getKnownMinValue());
+      return VF.getKnownMinValue() + Lane;
+    default:
+      assert(Lane < VF.getKnownMinValue());
+      return Lane;
+    }
+  }
+
+  /// Returns the maximum number of lanes that we are able to consider
+  /// caching for \p VF.
+  static unsigned getNumCachedLanes(const ElementCount &VF) {
+    return VF.getKnownMinValue() * (VF.isScalable() ? 2 : 1);
+  }
+};
+
 /// VPIteration represents a single point in the iteration space of the output
 /// (vectorized and/or unrolled) IR loop.
 struct VPIteration {
   /// in [0..UF)
   unsigned Part;
 
-  /// in [0..VF)
-  unsigned Lane;
+  VPLane Lane;
 
-  VPIteration(unsigned Part, unsigned Lane) : Part(Part), Lane(Lane) {}
+  VPIteration(unsigned Part, unsigned Lane,
+              VPLane::Kind Kind = VPLane::Kind::First)
+      : Part(Part), Lane(Lane, Kind) {}
 
-  bool isFirstIteration() const { return Part == 0 && Lane == 0; }
+  VPIteration(unsigned Part, const VPLane &Lane) : Part(Part), Lane(Lane) {}
+
+  bool isFirstIteration() const { return Part == 0 && Lane.isFirstLane(); }
 };
 
 /// VPTransformState holds information passed down when "executing" a VPlan,
@@ -157,9 +242,10 @@ struct VPTransformState {
     auto I = Data.PerPartScalars.find(Def);
     if (I == Data.PerPartScalars.end())
       return false;
+    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
     return Instance.Part < I->second.size() &&
-           Instance.Lane < I->second[Instance.Part].size() &&
-           I->second[Instance.Part][Instance.Lane];
+           CacheIdx < I->second[Instance.Part].size() &&
+           I->second[Instance.Part][CacheIdx];
   }
 
   /// Set the generated Value for a given VPValue and a given Part.
@@ -185,10 +271,11 @@ struct VPTransformState {
     while (PerPartVec.size() <= Instance.Part)
       PerPartVec.emplace_back();
     auto &Scalars = PerPartVec[Instance.Part];
-    while (Scalars.size() <= Instance.Lane)
+    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
+    while (Scalars.size() <= CacheIdx)
       Scalars.push_back(nullptr);
-    assert(!Scalars[Instance.Lane] && "should overwrite existing value");
-    Scalars[Instance.Lane] = V;
+    assert(!Scalars[CacheIdx] && "should overwrite existing value");
+    Scalars[CacheIdx] = V;
   }
 
   /// Reset an existing scalar value for \p Def and a given \p Instance.
@@ -198,9 +285,10 @@ struct VPTransformState {
            "need to overwrite existing value");
     assert(Instance.Part < Iter->second.size() &&
            "need to overwrite existing value");
-    assert(Instance.Lane < Iter->second[Instance.Part].size() &&
+    unsigned CacheIdx = Instance.Lane.mapToCacheIndex(VF);
+    assert(CacheIdx < Iter->second[Instance.Part].size() &&
            "need to overwrite existing value");
-    Iter->second[Instance.Part][Instance.Lane] = V;
+    Iter->second[Instance.Part][CacheIdx] = V;
   }
 
   /// Hold state information used when constructing the CFG of the output IR,
diff --git a/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll b/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
new file mode 100644
index 00000000000..2e916b2d726
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/sve-extract-last-veclane.ll
@@ -0,0 +1,77 @@
+; RUN: opt -loop-vectorize -dce -instcombine -mtriple aarch64-linux-gnu -S < %s 2>%t | FileCheck %s
+
+; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
+
+; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
+; WARN-NOT: warning
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @inv_store_last_lane
+; CHECK: vector.body:
+; CHECK: store <vscale x 4 x i32> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x i32> %[[VEC_VAL]], i32 %[[LAST_LANE]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !0
+
+exit:                                             ; preds = %for.body
+  %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42
+  store i32 %mul, i32* %arrayidx5, align 4
+  ret void
+}
+
+define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) #0 {
+; CHECK-LABEL: @ret_last_lane
+; CHECK: vector.body:
+; CHECK: store <vscale x 4 x float> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %[[VSCALE:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE2:.*]] = shl i32 %[[VSCALE]], 2
+; CHECK-NEXT: %[[LAST_LANE:.*]] = add i32 %[[VSCALE2]], -1
+; CHECK-NEXT: %{{.*}} = extractelement <vscale x 4 x float> %[[VEC_VAL]], i32 %[[LAST_LANE]]
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body, !llvm.loop !6
+
+exit:                                             ; preds = %for.body, %entry
+  ret float %mul
+}
+
+attributes #0 = { "target-cpu"="generic" "target-features"="+neon,+sve" }
+
+!0 = distinct !{!0, !1, !2, !3, !4, !5}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.vectorize.width", i32 4}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{!"llvm.loop.interleave.count", i32 1}
+!5 = !{!"llvm.loop.vectorize.enable", i1 true}
+!6 = distinct !{!6, !1, !2, !3, !4, !5}
diff --git a/test/Transforms/LoopVectorize/extract-last-veclane.ll b/test/Transforms/LoopVectorize/extract-last-veclane.ll
new file mode 100644
index 00000000000..e9a4541c01d
--- /dev/null
+++ b/test/Transforms/LoopVectorize/extract-last-veclane.ll
@@ -0,0 +1,53 @@
+; RUN: opt -loop-vectorize -dce -instcombine -S -force-vector-width=4 < %s 2>%t | FileCheck %s
+
+define void @inv_store_last_lane(i32* noalias nocapture %a, i32* noalias nocapture %inv, i32* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @inv_store_last_lane
+; CHECK: vector.body:
+; CHECK: store <4 x i32> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %{{.*}} = extractelement <4 x i32> %[[VEC_VAL]], i32 3
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = shl nsw i32 %0, 1
+  %arrayidx2 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %mul, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body
+  %arrayidx5 = getelementptr inbounds i32, i32* %inv, i64 42
+  store i32 %mul, i32* %arrayidx5, align 4
+  ret void
+}
+
+define float @ret_last_lane(float* noalias nocapture %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @ret_last_lane
+; CHECK: vector.body:
+; CHECK: store <4 x float> %[[VEC_VAL:.*]], <
+; CHECK: middle.block:
+; CHECK: %{{.*}} = extractelement <4 x float> %[[VEC_VAL]], i32 3
+
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %b, i64 %indvars.iv
+  %0 = load float, float* %arrayidx, align 4
+  %mul = fmul float %0, 2.000000e+00
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %n
+  br i1 %exitcond.not, label %exit, label %for.body
+
+exit:                                             ; preds = %for.body, %entry
+  ret float %mul
+}
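
To make the Kind::ScalableLast arithmetic concrete: getRuntimeVF() materialises vscale * VF.getKnownMinValue(), and getAsRuntimeExpr() then subtracts VF.getKnownMinValue() - Lane from it, so the extracted index is RuntimeVF - KnownMinVF + Lane. Below is a minimal standalone sketch of the same calculation in plain C++, using plain integers in place of IRBuilder-generated instructions; the function name and parameters are illustrative only and not part of the patch.

  #include <cassert>

  // Mirrors what VPLane::getAsRuntimeExpr() emits for Kind::ScalableLast.
  // KnownMinVF is VF.getKnownMinValue(); VScale stands in for the runtime
  // llvm.vscale() value.
  unsigned scalableLastLaneIndex(unsigned VScale, unsigned KnownMinVF,
                                 unsigned Lane) {
    assert(Lane < KnownMinVF && "Lane is an offset into the last subvector");
    unsigned RuntimeVF = VScale * KnownMinVF; // what getRuntimeVF() computes
    return RuntimeVF - (KnownMinVF - Lane);   // == RuntimeVF - KnownMinVF + Lane
  }

For <vscale x 4 x i32> with vscale == 2 there are 8 lanes in total, so the last lane (Lane == 3 within the final 4-element subvector) is scalableLastLaneIndex(2, 4, 3) == 8 - (4 - 3) == 7, matching the vscale/shl/add sequence checked in middle.block above.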
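
The caching scheme follows the same split: mapToCacheIndex() places Kind::First lanes in slots [0, KnownMinVF) and Kind::ScalableLast lanes in slots [KnownMinVF, 2 * KnownMinVF), which is why getNumCachedLanes() doubles the slot count for scalable VFs. A short worked example with the same illustrative VF:

  // For VF = vscale x 4, getNumCachedLanes(VF) == 4 * 2 == 8 slots per
  // (VPValue, Part) pair in PerPartScalars:
  //   VPLane(0, VPLane::Kind::First).mapToCacheIndex(VF)        == 0
  //   VPLane(3, VPLane::Kind::First).mapToCacheIndex(VF)        == 3
  //   VPLane(0, VPLane::Kind::ScalableLast).mapToCacheIndex(VF) == 4
  //   VPLane(3, VPLane::Kind::ScalableLast).mapToCacheIndex(VF) == 7
  // For a fixed VF of 4 only the four Kind::First slots are ever used,
  // since getLastLaneForVF() returns a Kind::First lane in that case.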