1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 19:23:23 +01:00

Revert "[LV] Emit @llvm.get.active.lane.mask for tail-folded loops"

This reverts commit 47650451738c821993c763356854b560a0f9f550
while I investigate the build bot failures.
This commit is contained in:
Sjoerd Meijer 2020-06-17 10:09:17 +01:00
parent fbe3876c2c
commit b7481f88bb
7 changed files with 22 additions and 120 deletions

View File

@ -1413,14 +1413,14 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
}
bool ARMTTIImpl::emitGetActiveLaneMask() const {
if (!ST->hasMVEIntegerOps() || DisableTailPredication)
if (!ST->hasMVEIntegerOps())
return false;
// Intrinsic @llvm.get.active.lane.mask is supported.
// TODO: Intrinsic @llvm.get.active.lane.mask is supported.
// It is used in the MVETailPredication pass, which requires the number of
// elements processed by this vector loop to set up the tail-predicated
// loop.
return true;
return false;
}
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {

View File

@ -6829,11 +6829,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
IV = IVRecipe->getVPValue();
}
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
bool TailFolded = !CM.isScalarEpilogueAllowed();
if (TailFolded && CM.TTI.emitGetActiveLaneMask())
BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
else
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
return BlockMaskCache[BB] = BlockMask;
}

View File

@ -380,20 +380,6 @@ void VPInstruction::generateInstruction(VPTransformState &State,
State.set(this, V, Part);
break;
}
case VPInstruction::ActiveLaneMask: {
// Get first lane of vector induction variable.
Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
// Get first lane of backedge-taken-count.
Value *ScalarBTC = State.get(getOperand(1), {Part, 0});
auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
auto *PredTy = VectorType::get(Int1Ty, State.VF);
Instruction *Call = Builder.CreateIntrinsic(
Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()},
{VIVElem0, ScalarBTC}, nullptr, "active.lane.mask");
State.set(this, Call, Part);
break;
}
default:
llvm_unreachable("Unsupported opcode for instruction");
}
@ -435,10 +421,6 @@ void VPInstruction::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
case VPInstruction::SLPStore:
O << "combined store";
break;
case VPInstruction::ActiveLaneMask:
O << "active lane mask";
break;
default:
O << Instruction::getOpcodeName(getOpcode());
}

View File

@ -685,7 +685,6 @@ public:
ICmpULE,
SLPLoad,
SLPStore,
ActiveLaneMask,
};
private:

View File

@ -45,12 +45,9 @@
define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: prefer_folding(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
;
; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
@ -510,13 +507,9 @@ for.body:
define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
; CHECK-LABEL: float(
; PREFER-FOLDING: vector.body:
; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask
; PREFER-FOLDING: %index.next = add i32 %index, 4
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
entry:
br label %for.body

View File

@ -15,13 +15,9 @@ target triple = "thumbv8.1m.main-arm-unknown-eabihf"
define dso_local void @sgt_loopguard(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
; COMMON-LABEL: @sgt_loopguard(
; COMMON: vector.body:
; CHECK-TF: %[[VIVELEM0:.*]] = extractelement <16 x i32> %vec.iv, i32 0
; CHECK-TF: %[[SCALARBTC:.*]] = extractelement <16 x i32> %broadcast.splat, i32 0
; CHECK-TF: %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[VIVELEM0]], i32 %[[SCALARBTC]])
; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
; CHECK-TF: llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask)
; CHECK-TF: masked.load
; CHECK-TF: masked.load
; CHECK-TF: masked.store
entry:
%cmp5 = icmp sgt i32 %N, 0
br i1 %cmp5, label %while.body.preheader, label %while.end

View File

@ -41,15 +41,11 @@ for.body:
define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
; COMMON-LABEL: tail_folding_enabled(
; COMMON: vector.body:
; COMMON: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; COMMON: %[[ELEM0:.*]] = add i64 %index, 0
; COMMON: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
; COMMON: %[[ADD:.*]] = add nsw <4 x i32> %[[WML2]], %[[WML1]]
; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]], {{.*}}<4 x i1> %active.lane.mask
; COMMON: %index.next = add i64 %index, 4
; COMMON: br i1 %{{.*}}, label %{{.*}}, label %vector.body
; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]]
; COMMON: br i1 %12, label %{{.*}}, label %vector.body
entry:
br label %for.body
@ -79,16 +75,13 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa
; PREDFLAG-LABEL: tail_folding_disabled(
; PREDFLAG: vector.body:
; PREDFLAG: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; PREDFLAG: %[[ELEM0:.*]] = add i64 %index, 0
; PREDFLAG: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
; PREDFLAG: %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load
; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32(
; PREDFLAG: %index.next = add i64 %index, 4
; PREDFLAG: %[[CMP:.*]] = icmp eq i64 %index.next, 432
; PREDFLAG: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !6
; PREDFLAG: %12 = icmp eq i64 %index.next, 432
; PREDFLAG: br i1 %{{.*}}, label %middle.block, label %vector.body, !llvm.loop !6
entry:
br label %for.body
@ -109,59 +102,6 @@ for.body:
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
}
define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
; PREDFLAG-LABEL: interleave4(
; PREDFLAG: %[[ADD1:.*]] = add i32 %index, 0
; PREDFLAG: %[[ADD2:.*]] = add i32 %index, 4
; PREDFLAG: %[[ADD3:.*]] = add i32 %index, 8
; PREDFLAG: %[[ADD4:.*]] = add i32 %index, 12
; PREDFLAG: %[[BTC:.*]] = extractelement <4 x i32> %broadcast.splat, i32 0
; PREDFLAG: %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %[[BTC]])
; PREDFLAG: %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %[[BTC]])
; PREDFLAG: %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %[[BTC]])
; PREDFLAG: %[[ALM4:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD4]], i32 %[[BTC]])
;
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
;
; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]])
; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]])
; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]])
; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]])
;
entry:
%cmp8 = icmp sgt i32 %N, 0
br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
br label %for.body
for.cond.cleanup.loopexit: ; preds = %for.body
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
ret void
for.body: ; preds = %for.body.preheader, %for.body
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
%0 = load i32, i32* %arrayidx, align 4
%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
%1 = load i32, i32* %arrayidx1, align 4
%add = add nsw i32 %1, %0
%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
store i32 %add, i32* %arrayidx2, align 4
%inc = add nuw nsw i32 %i.09, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !14
}
; CHECK: !0 = distinct !{!0, !1}
; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1}
; CHECK-NEXT: !2 = distinct !{!2, !3, !1}
@ -169,7 +109,6 @@ for.body: ; preds = %for.body.preheader,
; CHECK-NEXT: !4 = distinct !{!4, !1}
; CHECK-NEXT: !5 = distinct !{!5, !3, !1}
; CHECK-NEXT: !6 = distinct !{!6, !1}
attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
!6 = distinct !{!6, !7, !8}
@ -179,6 +118,3 @@ attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+
!10 = distinct !{!10, !11, !12}
!11 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
!12 = !{!"llvm.loop.vectorize.enable", i1 true}
!14 = distinct !{!14, !15}
!15 = !{!"llvm.loop.interleave.count", i32 4}