mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 19:23:23 +01:00
Revert "[LV] Emit @llvm.get.active.mask for tail-folded loops"
This reverts commit 47650451738c821993c763356854b560a0f9f550 while I investigate the build bot failures.
This commit is contained in:
parent
fbe3876c2c
commit
b7481f88bb
@ -1413,14 +1413,14 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
|
||||
}
|
||||
|
||||
bool ARMTTIImpl::emitGetActiveLaneMask() const {
|
||||
if (!ST->hasMVEIntegerOps() || DisableTailPredication)
|
||||
if (!ST->hasMVEIntegerOps())
|
||||
return false;
|
||||
|
||||
// Intrinsic @llvm.get.active.lane.mask is supported.
|
||||
// TODO: Intrinsic @llvm.get.active.lane.mask is supported.
|
||||
// It is used in the MVETailPredication pass, which requires the number of
|
||||
// elements processed by this vector loop to setup the tail-predicated
|
||||
// loop.
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
|
||||
TTI::UnrollingPreferences &UP) {
|
||||
|
@ -6829,11 +6829,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
|
||||
IV = IVRecipe->getVPValue();
|
||||
}
|
||||
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
|
||||
bool TailFolded = !CM.isScalarEpilogueAllowed();
|
||||
if (TailFolded && CM.TTI.emitGetActiveLaneMask())
|
||||
BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
|
||||
else
|
||||
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
|
||||
BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
|
||||
return BlockMaskCache[BB] = BlockMask;
|
||||
}
|
||||
|
||||
|
@ -380,20 +380,6 @@ void VPInstruction::generateInstruction(VPTransformState &State,
|
||||
State.set(this, V, Part);
|
||||
break;
|
||||
}
|
||||
case VPInstruction::ActiveLaneMask: {
|
||||
// Get first lane of vector induction variable.
|
||||
Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
|
||||
// Get first lane of backedge-taken-count.
|
||||
Value *ScalarBTC = State.get(getOperand(1), {Part, 0});
|
||||
|
||||
auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
|
||||
auto *PredTy = VectorType::get(Int1Ty, State.VF);
|
||||
Instruction *Call = Builder.CreateIntrinsic(
|
||||
Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()},
|
||||
{VIVElem0, ScalarBTC}, nullptr, "active.lane.mask");
|
||||
State.set(this, Call, Part);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
llvm_unreachable("Unsupported opcode for instruction");
|
||||
}
|
||||
@ -435,10 +421,6 @@ void VPInstruction::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
|
||||
case VPInstruction::SLPStore:
|
||||
O << "combined store";
|
||||
break;
|
||||
case VPInstruction::ActiveLaneMask:
|
||||
O << "active lane mask";
|
||||
break;
|
||||
|
||||
default:
|
||||
O << Instruction::getOpcodeName(getOpcode());
|
||||
}
|
||||
|
@ -685,7 +685,6 @@ public:
|
||||
ICmpULE,
|
||||
SLPLoad,
|
||||
SLPStore,
|
||||
ActiveLaneMask,
|
||||
};
|
||||
|
||||
private:
|
||||
|
@ -45,12 +45,9 @@
|
||||
define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
|
||||
; CHECK-LABEL: prefer_folding(
|
||||
; PREFER-FOLDING: vector.body:
|
||||
; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
|
||||
; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
|
||||
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
|
||||
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
|
||||
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
|
||||
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
|
||||
; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
|
||||
; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
|
||||
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
|
||||
;
|
||||
; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
|
||||
@ -510,13 +507,9 @@ for.body:
|
||||
define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
|
||||
; CHECK-LABEL: float(
|
||||
; PREFER-FOLDING: vector.body:
|
||||
; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
|
||||
; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
|
||||
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
|
||||
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
|
||||
; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask
|
||||
; PREFER-FOLDING: %index.next = add i32 %index, 4
|
||||
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
|
||||
; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
|
||||
; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32
|
||||
; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
|
||||
entry:
|
||||
br label %for.body
|
||||
|
@ -15,13 +15,9 @@ target triple = "thumbv8.1m.main-arm-unknown-eabihf"
|
||||
define dso_local void @sgt_loopguard(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
|
||||
; COMMON-LABEL: @sgt_loopguard(
|
||||
; COMMON: vector.body:
|
||||
|
||||
; CHECK-TF: %[[VIVELEM0:.*]] = extractelement <16 x i32> %vec.iv, i32 0
|
||||
; CHECK-TF: %[[SCALARBTC:.*]] = extractelement <16 x i32> %broadcast.splat, i32 0
|
||||
; CHECK-TF: %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[VIVELEM0]], i32 %[[SCALARBTC]])
|
||||
; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
|
||||
; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
|
||||
; CHECK-TF: llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask)
|
||||
; CHECK-TF: masked.load
|
||||
; CHECK-TF: masked.load
|
||||
; CHECK-TF: masked.store
|
||||
entry:
|
||||
%cmp5 = icmp sgt i32 %N, 0
|
||||
br i1 %cmp5, label %while.body.preheader, label %while.end
|
||||
|
@ -41,15 +41,11 @@ for.body:
|
||||
define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
|
||||
; COMMON-LABEL: tail_folding_enabled(
|
||||
; COMMON: vector.body:
|
||||
; COMMON: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
; COMMON: %[[ELEM0:.*]] = add i64 %index, 0
|
||||
; COMMON: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
|
||||
; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
|
||||
; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
|
||||
; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
|
||||
; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
|
||||
; COMMON: %[[ADD:.*]] = add nsw <4 x i32> %[[WML2]], %[[WML1]]
|
||||
; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]], {{.*}}<4 x i1> %active.lane.mask
|
||||
; COMMON: %index.next = add i64 %index, 4
|
||||
; COMMON: br i1 %{{.*}}, label %{{.*}}, label %vector.body
|
||||
; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]]
|
||||
; COMMON: br i1 %12, label %{{.*}}, label %vector.body
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
@ -79,16 +75,13 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa
|
||||
|
||||
; PREDFLAG-LABEL: tail_folding_disabled(
|
||||
; PREDFLAG: vector.body:
|
||||
; PREDFLAG: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
|
||||
; PREDFLAG: %[[ELEM0:.*]] = add i64 %index, 0
|
||||
; PREDFLAG: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
|
||||
; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
|
||||
; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
|
||||
; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
|
||||
; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
|
||||
; PREDFLAG: %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load
|
||||
; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
|
||||
; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32(
|
||||
; PREDFLAG: %index.next = add i64 %index, 4
|
||||
; PREDFLAG: %[[CMP:.*]] = icmp eq i64 %index.next, 432
|
||||
; PREDFLAG: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !6
|
||||
; PREDFLAG: %12 = icmp eq i64 %index.next, 432
|
||||
; PREDFLAG: br i1 %{{.*}}, label %middle.block, label %vector.body, !llvm.loop !6
|
||||
entry:
|
||||
br label %for.body
|
||||
|
||||
@ -109,59 +102,6 @@ for.body:
|
||||
br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
|
||||
}
|
||||
|
||||
define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
|
||||
; PREDFLAG-LABEL: interleave4(
|
||||
; PREDFLAG: %[[ADD1:.*]] = add i32 %index, 0
|
||||
; PREDFLAG: %[[ADD2:.*]] = add i32 %index, 4
|
||||
; PREDFLAG: %[[ADD3:.*]] = add i32 %index, 8
|
||||
; PREDFLAG: %[[ADD4:.*]] = add i32 %index, 12
|
||||
; PREDFLAG: %[[BTC:.*]] = extractelement <4 x i32> %broadcast.splat, i32 0
|
||||
; PREDFLAG: %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %[[BTC]])
|
||||
; PREDFLAG: %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %[[BTC]])
|
||||
; PREDFLAG: %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %[[BTC]])
|
||||
; PREDFLAG: %[[ALM4:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD4]], i32 %[[BTC]])
|
||||
;
|
||||
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
|
||||
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
|
||||
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
|
||||
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
|
||||
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
|
||||
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
|
||||
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
|
||||
; PREDFLAG: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
|
||||
;
|
||||
; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]])
|
||||
; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]])
|
||||
; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]])
|
||||
; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]])
|
||||
;
|
||||
entry:
|
||||
%cmp8 = icmp sgt i32 %N, 0
|
||||
br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
|
||||
|
||||
for.body.preheader: ; preds = %entry
|
||||
br label %for.body
|
||||
|
||||
for.cond.cleanup.loopexit: ; preds = %for.body
|
||||
br label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
|
||||
ret void
|
||||
|
||||
for.body: ; preds = %for.body.preheader, %for.body
|
||||
%i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
||||
%arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
|
||||
%0 = load i32, i32* %arrayidx, align 4
|
||||
%arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
|
||||
%1 = load i32, i32* %arrayidx1, align 4
|
||||
%add = add nsw i32 %1, %0
|
||||
%arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
|
||||
store i32 %add, i32* %arrayidx2, align 4
|
||||
%inc = add nuw nsw i32 %i.09, 1
|
||||
%exitcond = icmp eq i32 %inc, %N
|
||||
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !14
|
||||
}
|
||||
|
||||
; CHECK: !0 = distinct !{!0, !1}
|
||||
; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1}
|
||||
; CHECK-NEXT: !2 = distinct !{!2, !3, !1}
|
||||
@ -169,7 +109,6 @@ for.body: ; preds = %for.body.preheader,
|
||||
; CHECK-NEXT: !4 = distinct !{!4, !1}
|
||||
; CHECK-NEXT: !5 = distinct !{!5, !3, !1}
|
||||
; CHECK-NEXT: !6 = distinct !{!6, !1}
|
||||
|
||||
attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }
|
||||
|
||||
!6 = distinct !{!6, !7, !8}
|
||||
@ -179,6 +118,3 @@ attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+
|
||||
!10 = distinct !{!10, !11, !12}
|
||||
!11 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
|
||||
!12 = !{!"llvm.loop.vectorize.enable", i1 true}
|
||||
|
||||
!14 = distinct !{!14, !15}
|
||||
!15 = !{!"llvm.loop.interleave.count", i32 4}
|
||||
|
Loading…
Reference in New Issue
Block a user