Revert "[LV] Emit @llvm.get.active.mask for tail-folded loops"

This reverts commit 47650451738c821993c763356854b560a0f9f550 while I investigate the build bot failures.
2024-11-23 19:23:23 +01:00 · 2020-06-17 10:09:17 +01:00 · 2020-06-17 10:09:17 +01:00 · b7481f88bb
commit b7481f88bb
parent fbe3876c2c
7 changed files with 22 additions and 120 deletions
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@ -1413,14 +1413,14 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
 }

 bool ARMTTIImpl::emitGetActiveLaneMask() const {
-  if (!ST->hasMVEIntegerOps() || DisableTailPredication)
+  if (!ST->hasMVEIntegerOps())
    return false;

-  // Intrinsic @llvm.get.active.lane.mask is supported.
+  // TODO: Intrinsic @llvm.get.active.lane.mask is supported.
  // It is used in the MVETailPredication pass, which requires the number of
  // elements processed by this vector loop to setup the tail-predicated
  // loop.
-  return true;
+  return false;
 }
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                         TTI::UnrollingPreferences &UP) {
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -6829,11 +6829,7 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
      IV = IVRecipe->getVPValue();
    }
    VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
-    bool TailFolded = !CM.isScalarEpilogueAllowed();
-    if (TailFolded && CM.TTI.emitGetActiveLaneMask())
-      BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
-    else
-      BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
    return BlockMaskCache[BB] = BlockMask;
  }

--- a/lib/Transforms/Vectorize/VPlan.cpp
+++ b/lib/Transforms/Vectorize/VPlan.cpp
@ -380,20 +380,6 @@ void VPInstruction::generateInstruction(VPTransformState &State,
    State.set(this, V, Part);
    break;
  }
-  case VPInstruction::ActiveLaneMask: {
-    // Get first lane of vector induction variable.
-    Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
-    // Get first lane of backedge-taken-count.
-    Value *ScalarBTC = State.get(getOperand(1), {Part, 0});
-
-    auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
-    auto *PredTy = VectorType::get(Int1Ty, State.VF);
-    Instruction *Call = Builder.CreateIntrinsic(
-        Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()},
-        {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask");
-    State.set(this, Call, Part);
-    break;
-  }
  default:
    llvm_unreachable("Unsupported opcode for instruction");
  }
@ -435,10 +421,6 @@ void VPInstruction::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
  case VPInstruction::SLPStore:
    O << "combined store";
    break;
-  case VPInstruction::ActiveLaneMask:
-    O << "active lane mask";
-    break;
-
  default:
    O << Instruction::getOpcodeName(getOpcode());
  }
--- a/lib/Transforms/Vectorize/VPlan.h
+++ b/lib/Transforms/Vectorize/VPlan.h
@ -685,7 +685,6 @@ public:
    ICmpULE,
    SLPLoad,
    SLPStore,
-    ActiveLaneMask,
  };

 private:
--- a/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
+++ b/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll
@ -45,12 +45,9 @@
 define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
 ; CHECK-LABEL:    prefer_folding(
 ; PREFER-FOLDING: vector.body:
-; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
-; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
-; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
-; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask,
-; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
+; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32
+; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32
 ; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
 ;
 ; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
@ -510,13 +507,9 @@ for.body:
 define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 {
 ; CHECK-LABEL:    float(
 ; PREFER-FOLDING: vector.body:
-; PREFER-FOLDING: %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; PREFER-FOLDING: %[[VIVELEM0:.*]] = add i32 %index, 0
-; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[VIVELEM0]], i32 430)
-; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
-; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask
-; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask
-; PREFER-FOLDING: %index.next = add i32 %index, 4
+; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
+; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32
+; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32
 ; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body
 entry:
  br label %for.body
--- a/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
+++ b/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@ -15,13 +15,9 @@ target triple = "thumbv8.1m.main-arm-unknown-eabihf"
 define dso_local void @sgt_loopguard(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
 ; COMMON-LABEL: @sgt_loopguard(
 ; COMMON:       vector.body:
-
-; CHECK-TF:     %[[VIVELEM0:.*]] = extractelement <16 x i32> %vec.iv, i32 0
-; CHECK-TF:     %[[SCALARBTC:.*]] = extractelement <16 x i32> %broadcast.splat, i32 0
-; CHECK-TF:     %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[VIVELEM0]], i32 %[[SCALARBTC]])
-; CHECK-TF:     llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
-; CHECK-TF:     llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask
-; CHECK-TF:     llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask)
+; CHECK-TF:     masked.load
+; CHECK-TF:     masked.load
+; CHECK-TF:     masked.store
 entry:
  %cmp5 = icmp sgt i32 %N, 0
  br i1 %cmp5, label %while.body.preheader, label %while.end
--- a/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
+++ b/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
@ -41,15 +41,11 @@ for.body:
 define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
 ; COMMON-LABEL: tail_folding_enabled(
 ; COMMON: vector.body:
-; COMMON:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; COMMON:   %[[ELEM0:.*]] = add i64 %index, 0
-; COMMON:   %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
-; COMMON:   %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
-; COMMON:   %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask
+; COMMON:   %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; COMMON:   %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
 ; COMMON:   %[[ADD:.*]] = add nsw <4 x i32> %[[WML2]], %[[WML1]]
-; COMMON:   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]], {{.*}}<4 x i1> %active.lane.mask
-; COMMON:   %index.next = add i64 %index, 4
-; COMMON:   br i1 %{{.*}}, label %{{.*}}, label %vector.body
+; COMMON:   call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]]
+; COMMON:   br i1 %12, label %{{.*}}, label %vector.body
 entry:
  br label %for.body

@ -79,16 +75,13 @@ define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noa

 ; PREDFLAG-LABEL: tail_folding_disabled(
 ; PREDFLAG:  vector.body:
-; PREDFLAG:  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; PREDFLAG:  %[[ELEM0:.*]] = add i64 %index, 0
-; PREDFLAG:  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429)
-; PREDFLAG:  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
-; PREDFLAG:  %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREDFLAG:  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; PREDFLAG:  %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
 ; PREDFLAG:  %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load
-; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask
+; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32(
 ; PREDFLAG:  %index.next = add i64 %index, 4
-; PREDFLAG:  %[[CMP:.*]] = icmp eq i64 %index.next, 432
-; PREDFLAG:  br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !6
+; PREDFLAG:  %12 = icmp eq i64 %index.next, 432
+; PREDFLAG:  br i1 %{{.*}}, label %middle.block, label %vector.body, !llvm.loop !6
 entry:
  br label %for.body

@ -109,59 +102,6 @@ for.body:
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10
 }

-define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
-; PREDFLAG-LABEL: interleave4(
-; PREDFLAG:  %[[ADD1:.*]] = add i32 %index, 0
-; PREDFLAG:  %[[ADD2:.*]] = add i32 %index, 4
-; PREDFLAG:  %[[ADD3:.*]] = add i32 %index, 8
-; PREDFLAG:  %[[ADD4:.*]] = add i32 %index, 12
-; PREDFLAG:  %[[BTC:.*]] = extractelement <4 x i32> %broadcast.splat, i32 0
-; PREDFLAG:  %[[ALM1:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD1]], i32 %[[BTC]])
-; PREDFLAG:  %[[ALM2:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD2]], i32 %[[BTC]])
-; PREDFLAG:  %[[ALM3:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD3]], i32 %[[BTC]])
-; PREDFLAG:  %[[ALM4:active.lane.mask.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ADD4]], i32 %[[BTC]])
-;
-; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
-; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
-; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
-; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
-; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]],{{.*}}
-; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]],{{.*}}
-; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]],{{.*}}
-; PREDFLAG:  call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]],{{.*}}
-;
-; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM1]])
-; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM2]])
-; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM3]])
-; PREDFLAG:  call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ALM4]])
-;
-entry:
-  %cmp8 = icmp sgt i32 %N, 0
-  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup
-
-for.body.preheader:                               ; preds = %entry
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.body
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  ret void
-
-for.body:                                         ; preds = %for.body.preheader, %for.body
-  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
-  %arrayidx = getelementptr inbounds i32, i32* %B, i32 %i.09
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %C, i32 %i.09
-  %1 = load i32, i32* %arrayidx1, align 4
-  %add = add nsw i32 %1, %0
-  %arrayidx2 = getelementptr inbounds i32, i32* %A, i32 %i.09
-  store i32 %add, i32* %arrayidx2, align 4
-  %inc = add nuw nsw i32 %i.09, 1
-  %exitcond = icmp eq i32 %inc, %N
-  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !14
-}
-
 ; CHECK:      !0 = distinct !{!0, !1}
 ; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK-NEXT: !2 = distinct !{!2, !3, !1}
@ -169,7 +109,6 @@ for.body:                                         ; preds = %for.body.preheader,
 ; CHECK-NEXT: !4 = distinct !{!4, !1}
 ; CHECK-NEXT: !5 = distinct !{!5, !3, !1}
 ; CHECK-NEXT: !6 = distinct !{!6, !1}
-
 attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" }

 !6 = distinct !{!6, !7, !8}
@ -179,6 +118,3 @@ attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+
 !10 = distinct !{!10, !11, !12}
 !11 = !{!"llvm.loop.vectorize.predicate.enable", i1 false}
 !12 = !{!"llvm.loop.vectorize.enable", i1 true}
-
-!14 = distinct !{!14, !15}
-!15 = !{!"llvm.loop.interleave.count", i32 4}