1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-18 18:42:46 +02:00

[LoopVectorize] Fix scalarisation crash in widenPHIInstruction for scalable vectors

In InnerLoopVectorizer::widenPHIInstruction there are cases where we have
to scalarise a pointer induction variable after vectorisation. For scalable
vectors we already deal with the case where the pointer induction variable
is uniform, but we currently crash if not uniform. For fixed width vectors
we calculate every lane of the scalarised pointer induction variable for a
given VF, however this cannot work for scalable vectors. In this case I
have added support for caching the whole vector value for each unrolled
part so that we can always extract an arbitrary element. Additionally, we
still continue to cache the known minimum number of lanes too in order
to improve code quality by avoiding an extractelement operation.

I have adapted an existing test `pointer_iv_mixed` from the file:

  Transforms/LoopVectorize/consecutive-ptr-uniforms.ll

and added it here for scalable vectors instead:

  Transforms/LoopVectorize/AArch64/sve-widen-phi.ll

Differential Revision: https://reviews.llvm.org/D101294
This commit is contained in:
David Sherwood 2021-04-21 16:36:11 +01:00
parent 7081b98fbb
commit 0d70233bde
2 changed files with 97 additions and 7 deletions

View File

@ -3335,8 +3335,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
SCEVExpander Exp(*SE, DL, "induction");
auto Step = ID.getStep();
auto StartValue = ID.getStartValue();
assert(Index->getType() == Step->getType() &&
"Index type does not match StepValue type");
assert(Index->getType()->getScalarType() == Step->getType() &&
"Index scalar type does not match StepValue type");
// Note: the IR at this point is broken. We cannot use SE to create any new
// SCEV and then expand it, hoping that SCEV's simplification will give us
@ -3355,14 +3355,20 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
return B.CreateAdd(X, Y);
};
// We allow X to be a vector type, in which case Y will potentially be
// splatted into a vector with the same element count.
auto CreateMul = [&B](Value *X, Value *Y) {
assert(X->getType() == Y->getType() && "Types don't match!");
assert(X->getType()->getScalarType() == Y->getType() &&
"Types don't match!");
if (auto *CX = dyn_cast<ConstantInt>(X))
if (CX->isOne())
return Y;
if (auto *CY = dyn_cast<ConstantInt>(Y))
if (CY->isOne())
return X;
VectorType *XVTy = dyn_cast<VectorType>(X->getType());
if (XVTy && !isa<VectorType>(Y->getType()))
Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
return B.CreateMul(X, Y);
};
@ -3381,6 +3387,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
switch (ID.getKind()) {
case InductionDescriptor::IK_IntInduction: {
assert(!isa<VectorType>(Index->getType()) &&
"Vector indices not supported for integer inductions yet");
assert(Index->getType() == StartValue->getType() &&
"Index type does not match StartValue type");
if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
@ -3395,9 +3403,12 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
return B.CreateGEP(
StartValue->getType()->getPointerElementType(), StartValue,
CreateMul(Index,
Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
GetInsertPoint())));
}
case InductionDescriptor::IK_FpInduction: {
assert(!isa<VectorType>(Index->getType()) &&
"Vector indices not supported for FP inductions yet");
assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
auto InductionBinOp = ID.getInductionBinOp();
assert(InductionBinOp &&
@ -4816,13 +4827,33 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
assert((IsUniform || !VF.isScalable()) &&
"Currently unsupported for scalable vectors");
unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
bool NeedsVectorIndex = !IsUniform && VF.isScalable();
Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
if (NeedsVectorIndex) {
Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
UnitStepVec = Builder.CreateStepVector(VecIVTy);
PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
}
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PartStart = createStepForVF(
Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
if (NeedsVectorIndex) {
Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
Value *SclrGep =
emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
SclrGep->setName("next.gep");
State.set(PhiR, SclrGep, Part);
// We've cached the whole vector, which means we can support the
// extraction of any lane.
continue;
}
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
Value *Idx = Builder.CreateAdd(
PartStart, ConstantInt::get(PtrInd->getType(), Lane));

View File

@ -114,9 +114,68 @@ for.cond.cleanup: ; preds = %for.body
ret void
}
;
; Check multiple pointer induction variables where only one is recognized as
; uniform and remains uniform after vectorization. The other pointer induction
; variable is not recognized as uniform and is not uniform after vectorization
; because it is stored to memory.
;
define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) {
; CHECK-LABEL: @pointer_iv_mixed(
; CHECK: vector.body
; CHECK: %[[IDX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
; CHECK: %[[STEPVEC:.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: %[[TMP1:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[IDX]], i32 0
; CHECK-NEXT: %[[TMP2:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[STEPVEC]]
; CHECK-NEXT: %[[APTRS1:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND1]]
; CHECK-NEXT: %[[VSCALE64:.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: %[[VSCALE64X2:.*]] = shl i64 %[[VSCALE64]], 1
; CHECK-NEXT: %[[TMP3:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[VSCALE64X2]], i32 0
; CHECK-NEXT: %[[TMP4:.*]] = shufflevector <vscale x 2 x i64> %[[TMP3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: %[[TMP5:.*]] = add <vscale x 2 x i64> %[[TMP4]], %[[STEPVEC]]
; CHECK-NEXT: %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[TMP5]]
; CHECK-NEXT: %[[APTRS2:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND2]]
; CHECK-NEXT: %[[GEPB1:.*]] = getelementptr i32*, i32** %b, i64 %[[IDX]]
; CHECK: %[[BPTR1:.*]] = bitcast i32** %[[GEPB1]] to <vscale x 2 x i32*>*
; CHECK-NEXT: store <vscale x 2 x i32*> %[[APTRS1]], <vscale x 2 x i32*>* %[[BPTR1]], align 8
; CHECK: %[[VSCALE32:.*]] = call i32 @llvm.vscale.i32()
; CHECK-NEXT: %[[VSCALE32X2:.*]] = shl i32 %[[VSCALE32]], 1
; CHECK-NEXT: %[[TMP6:.*]] = sext i32 %[[VSCALE32X2]] to i64
; CHECK-NEXT: %[[GEPB2:.*]] = getelementptr i32*, i32** %[[GEPB1]], i64 %[[TMP6]]
; CHECK-NEXT: %[[BPTR2:.*]] = bitcast i32** %[[GEPB2]] to <vscale x 2 x i32*>*
; CHECK-NEXT store <vscale x 2 x i32*> %[[APTRS2]], <vscale x 2 x i32*>* %[[BPTR2]], align 8
entry:
br label %for.body
for.body:
%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
%p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
%q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
%tmp0 = phi i32 [ %tmp2, %for.body ], [ 0, %entry ]
%tmp1 = load i32, i32* %p, align 8
%tmp2 = add i32 %tmp1, %tmp0
store i32* %p, i32** %q, align 8
%tmp3 = getelementptr inbounds i32, i32* %p, i32 1
%tmp4 = getelementptr inbounds i32*, i32** %q, i32 1
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end, !llvm.loop !6
for.end:
%tmp5 = phi i32 [ %tmp2, %for.body ]
ret i32 %tmp5
}
!0 = distinct !{!0, !1, !2, !3, !4, !5}
!1 = !{!"llvm.loop.mustprogress"}
!2 = !{!"llvm.loop.vectorize.width", i32 4}
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}
!5 = !{!"llvm.loop.interleave.count", i32 2}
!6 = distinct !{!6, !1, !7, !3, !4, !5}
!7 = !{!"llvm.loop.vectorize.width", i32 2}