[VPlan] Add first VPlan version of sinkScalarOperands.

This patch adds a first VPlan-based implementation of sinking of scalar operands. The current version traverse a VPlan once and processes all operands of a predicated REPLICATE recipe. If one of those operands can be sunk, it is moved to the block containing the predicated REPLICATE recipe. Continue with processing the operands of the sunk recipe. The initial version does not re-process candidates after other recipes have been sunk. It also cannot partially sink induction increments at the moment. The VPlan only contains WIDEN-INDUCTION recipes and if the induction is used for example in a GEP, only the first lane is used and in the lowered IR the adds for the other lanes can be sunk into the predicated blocks. Reviewed By: Ayal Differential Revision: https://reviews.llvm.org/D100258
2024-10-19 02:52:53 +02:00 · 2021-05-24 14:14:08 +01:00 · 2021-05-24 14:14:08 +01:00 · 45eacd3210
commit 45eacd3210
parent 6129236a8f
8 changed files with 108 additions and 45 deletions
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -4599,12 +4599,22 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
    while (!Worklist.empty()) {
      auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());

-      // We can't sink an instruction if it is a phi node, is already in the
-      // predicated block, is not in the loop, or may have side effects.
-      if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
-          !VectorLoop->contains(I) || I->mayHaveSideEffects())
+      // We can't sink an instruction if it is a phi node, is not in the loop,
+      // or may have side effects.
+      if (!I || isa<PHINode>(I) || !VectorLoop->contains(I) ||
+          I->mayHaveSideEffects())
        continue;

+      // If the instruction is already in PredBB, check if we can sink its
+      // operands. In that case, VPlan's sinkScalarOperands() succeeded in
+      // sinking the scalar instruction I, hence it appears in PredBB; but it
+      // may have failed to sink I's operands (recursively), which we try
+      // (again) here.
+      if (I->getParent() == PredBB) {
+        Worklist.insert(I->op_begin(), I->op_end());
+        continue;
+      }
+
      // It's legal to sink the instruction if all its uses occur in the
      // predicated block. Otherwise, there's nothing to do yet, and we may
      // need to reanalyze the instruction.
@ -9245,6 +9255,8 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
    }
  }

+  VPlanTransforms::sinkScalarOperands(*Plan);
+
  std::string PlanName;
  raw_string_ostream RSO(PlanName);
  ElementCount VF = Range.Start;
--- a/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/lib/Transforms/Vectorize/VPlanTransforms.cpp
@ -99,3 +99,52 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
    }
  }
 }
+
+bool VPlanTransforms::sinkScalarOperands(VPlan &Plan) {
+  auto Iter = depth_first(
+      VPBlockRecursiveTraversalWrapper<VPBlockBase *>(Plan.getEntry()));
+  bool Changed = false;
+  // First, collect the operands of all predicated replicate recipes as seeds
+  // for sinking.
+  SetVector<VPValue *> WorkList;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
+    for (auto &Recipe : *VPBB) {
+      auto *RepR = dyn_cast<VPReplicateRecipe>(&Recipe);
+      if (!RepR || !RepR->isPredicated())
+        continue;
+      WorkList.insert(RepR->op_begin(), RepR->op_end());
+    }
+  }
+
+  // Try to sink each replicate recipe in the worklist.
+  while (!WorkList.empty()) {
+    auto *C = WorkList.pop_back_val();
+    auto *SinkCandidate = dyn_cast_or_null<VPReplicateRecipe>(C->Def);
+    if (!SinkCandidate)
+      continue;
+
+    // All users of SinkCandidate must be in the same block in order to perform
+    // sinking. Therefore the destination block for sinking must match the block
+    // containing the first user.
+    auto *FirstUser = dyn_cast<VPRecipeBase>(*SinkCandidate->user_begin());
+    if (!FirstUser)
+      continue;
+    VPBasicBlock *SinkTo = FirstUser->getParent();
+    if (SinkCandidate->getParent() == SinkTo ||
+        SinkCandidate->mayHaveSideEffects() ||
+        SinkCandidate->mayReadOrWriteMemory())
+      continue;
+
+    // All recipe users of the sink candidate must be in the same block SinkTo.
+    if (any_of(SinkCandidate->users(), [SinkTo](VPUser *U) {
+          auto *UI = dyn_cast<VPRecipeBase>(U);
+          return !UI || UI->getParent() != SinkTo;
+        }))
+      continue;
+
+    SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi());
+    WorkList.insert(SinkCandidate->op_begin(), SinkCandidate->op_end());
+    Changed = true;
+  }
+  return Changed;
+}
--- a/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/lib/Transforms/Vectorize/VPlanTransforms.h
@ -28,6 +28,8 @@ struct VPlanTransforms {
      Loop *OrigLoop, VPlanPtr &Plan,
      LoopVectorizationLegality::InductionList &Inductions,
      SmallPtrSetImpl<Instruction *> &DeadInstructions, ScalarEvolution &SE);
+
+  static bool sinkScalarOperands(VPlan &Plan);
 };

 } // namespace llvm
--- a/test/Transforms/LoopVectorize/X86/small-size.ll
+++ b/test/Transforms/LoopVectorize/X86/small-size.ll
@ -567,40 +567,40 @@ define void @example23c(i16* noalias nocapture %src, i32* noalias nocapture %dst
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <4 x i1> [[TMP1]], i32 0
 ; CHECK-NEXT:    br i1 [[TMP17]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = zext i16 [[TMP4]] to i32
 ; CHECK-NEXT:    [[TMP19:%.*]] = shl nuw nsw i32 [[TMP18]], 7
-; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i32, i32* [[DST:%.*]], i64 [[INDEX]]
 ; CHECK-NEXT:    store i32 [[TMP19]], i32* [[NEXT_GEP7]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
 ; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <4 x i1> [[TMP1]], i32 1
 ; CHECK-NEXT:    br i1 [[TMP20]], label [[PRED_STORE_IF17:%.*]], label [[PRED_STORE_CONTINUE18:%.*]]
 ; CHECK:       pred.store.if17:
-; CHECK-NEXT:    [[TMP21:%.*]] = zext i16 [[TMP8]] to i32
-; CHECK-NEXT:    [[TMP22:%.*]] = shl nuw nsw i32 [[TMP21]], 7
-; CHECK-NEXT:    [[TMP23:%.*]] = or i64 [[INDEX]], 1
-; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP23]]
-; CHECK-NEXT:    store i32 [[TMP22]], i32* [[NEXT_GEP8]], align 4
+; CHECK-NEXT:    [[TMP21:%.*]] = or i64 [[INDEX]], 1
+; CHECK-NEXT:    [[NEXT_GEP8:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP21]]
+; CHECK-NEXT:    [[TMP22:%.*]] = zext i16 [[TMP8]] to i32
+; CHECK-NEXT:    [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 7
+; CHECK-NEXT:    store i32 [[TMP23]], i32* [[NEXT_GEP8]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE18]]
 ; CHECK:       pred.store.continue18:
 ; CHECK-NEXT:    [[TMP24:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
 ; CHECK-NEXT:    br i1 [[TMP24]], label [[PRED_STORE_IF19:%.*]], label [[PRED_STORE_CONTINUE20:%.*]]
 ; CHECK:       pred.store.if19:
-; CHECK-NEXT:    [[TMP25:%.*]] = zext i16 [[TMP12]] to i32
-; CHECK-NEXT:    [[TMP26:%.*]] = shl nuw nsw i32 [[TMP25]], 7
-; CHECK-NEXT:    [[TMP27:%.*]] = or i64 [[INDEX]], 2
-; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP27]]
-; CHECK-NEXT:    store i32 [[TMP26]], i32* [[NEXT_GEP9]], align 4
+; CHECK-NEXT:    [[TMP25:%.*]] = or i64 [[INDEX]], 2
+; CHECK-NEXT:    [[NEXT_GEP9:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP25]]
+; CHECK-NEXT:    [[TMP26:%.*]] = zext i16 [[TMP12]] to i32
+; CHECK-NEXT:    [[TMP27:%.*]] = shl nuw nsw i32 [[TMP26]], 7
+; CHECK-NEXT:    store i32 [[TMP27]], i32* [[NEXT_GEP9]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE20]]
 ; CHECK:       pred.store.continue20:
 ; CHECK-NEXT:    [[TMP28:%.*]] = extractelement <4 x i1> [[TMP1]], i32 3
 ; CHECK-NEXT:    br i1 [[TMP28]], label [[PRED_STORE_IF21:%.*]], label [[PRED_STORE_CONTINUE22]]
 ; CHECK:       pred.store.if21:
-; CHECK-NEXT:    [[TMP29:%.*]] = zext i16 [[TMP16]] to i32
-; CHECK-NEXT:    [[TMP30:%.*]] = shl nuw nsw i32 [[TMP29]], 7
-; CHECK-NEXT:    [[TMP31:%.*]] = or i64 [[INDEX]], 3
-; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP31]]
-; CHECK-NEXT:    store i32 [[TMP30]], i32* [[NEXT_GEP10]], align 4
+; CHECK-NEXT:    [[TMP29:%.*]] = or i64 [[INDEX]], 3
+; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i32, i32* [[DST]], i64 [[TMP29]]
+; CHECK-NEXT:    [[TMP30:%.*]] = zext i16 [[TMP16]] to i32
+; CHECK-NEXT:    [[TMP31:%.*]] = shl nuw nsw i32 [[TMP30]], 7
+; CHECK-NEXT:    store i32 [[TMP31]], i32* [[NEXT_GEP10]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE22]]
 ; CHECK:       pred.store.continue22:
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
--- a/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
+++ b/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll
@ -16,7 +16,6 @@ define void @sink_replicate_region_1(i32 %x, i8* %ptr) optsize {
 ; CHECK-NEXT: Successor(s): loop.0

 ; CHECK:      loop.0:
-; CHECK-NEXT:   REPLICATE ir<%gep> = getelementptr ir<%ptr>, ir<%iv>
 ; CHECK-NEXT: Successor(s): pred.load

 ; CHECK:     <xVFxUF> pred.load: {
@ -26,6 +25,7 @@ define void @sink_replicate_region_1(i32 %x, i8* %ptr) optsize {
 ; CHECK-NEXT:   CondBit: vp<%3> (loop)

 ; CHECK:      pred.load.if:
+; CHECK-NEXT:     REPLICATE ir<%gep> = getelementptr ir<%ptr>, ir<%iv>
 ; CHECK-NEXT:     REPLICATE ir<%lv> = load ir<%gep> (S->V)
 ; CHECK-NEXT:   Successor(s): pred.load.continue

@ -106,8 +106,6 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, i32* %ptr) optsize {
 ; CHECK-NEXT: }

 ; CHECK:     loop.0.split:
-; CHECK-NEXT:   REPLICATE ir<%add> = add vp<%6>, ir<%recur.next>
-; CHECK-NEXT:   REPLICATE ir<%gep> = getelementptr ir<%ptr>, ir<%iv>
 ; CHECK-NEXT: Successor(s): pred.store

 ; CHECK:      <xVFxUF> pred.store: {
@ -117,6 +115,8 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, i32* %ptr) optsize {
 ; CHECK-NEXT:   CondBit: vp<%3> (loop)

 ; CHECK:      pred.store.if:
+; CHECK-NEXT:     REPLICATE ir<%add> = add vp<%6>, ir<%recur.next>
+; CHECK-NEXT:     REPLICATE ir<%gep> = getelementptr ir<%ptr>, ir<%iv>
 ; CHECK-NEXT:     REPLICATE store ir<%add>, ir<%gep>
 ; CHECK-NEXT:   Successor(s): pred.store.continue

--- a/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/test/Transforms/LoopVectorize/if-pred-stores.ll
@ -130,11 +130,11 @@ define i32 @test(i32* nocapture %f) #0 {
 ; VEC-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
 ; VEC-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
 ; VEC:       pred.store.if1:
-; VEC-NEXT:    [[TMP9:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
-; VEC-NEXT:    [[TMP10:%.*]] = add nsw i32 [[TMP9]], 20
-; VEC-NEXT:    [[TMP11:%.*]] = add i64 [[INDEX]], 1
-; VEC-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[TMP11]]
-; VEC-NEXT:    store i32 [[TMP10]], i32* [[TMP12]], align 4
+; VEC-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 1
+; VEC-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[F]], i64 [[TMP9]]
+; VEC-NEXT:    [[TMP11:%.*]] = extractelement <2 x i32> [[WIDE_LOAD]], i32 1
+; VEC-NEXT:    [[TMP12:%.*]] = add nsw i32 [[TMP11]], 20
+; VEC-NEXT:    store i32 [[TMP12]], i32* [[TMP10]], align 4
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE2]]
 ; VEC:       pred.store.continue2:
 ; VEC-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
@ -565,12 +565,12 @@ define void @minimal_bit_widths(i1 %c) {
 ; VEC-NEXT:    [[TMP9:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 1
 ; VEC-NEXT:    br i1 [[TMP9]], label [[PRED_STORE_IF2:%.*]], label [[PRED_STORE_CONTINUE3]]
 ; VEC:       pred.store.if2:
-; VEC-NEXT:    [[TMP10:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1
-; VEC-NEXT:    [[TMP11:%.*]] = zext i8 [[TMP10]] to i32
-; VEC-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i8
-; VEC-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 1
-; VEC-NEXT:    [[TMP14:%.*]] = getelementptr i8, i8* undef, i64 [[TMP13]]
-; VEC-NEXT:    store i8 [[TMP12]], i8* [[TMP14]], align 1
+; VEC-NEXT:    [[TMP10:%.*]] = add i64 [[INDEX]], 1
+; VEC-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* undef, i64 [[TMP10]]
+; VEC-NEXT:    [[TMP12:%.*]] = extractelement <2 x i8> [[WIDE_LOAD]], i32 1
+; VEC-NEXT:    [[TMP13:%.*]] = zext i8 [[TMP12]] to i32
+; VEC-NEXT:    [[TMP14:%.*]] = trunc i32 [[TMP13]] to i8
+; VEC-NEXT:    store i8 [[TMP14]], i8* [[TMP11]], align 1
 ; VEC-NEXT:    br label [[PRED_STORE_CONTINUE3]]
 ; VEC:       pred.store.continue3:
 ; VEC-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
--- a/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
+++ b/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge-vf1.ll
@ -14,8 +14,6 @@ define void @sink_with_sideeffects(i1 %c, i8* %ptr) {
 ; CHECK-NEXT:   CLONE ir<%tmp2> = getelementptr ir<%ptr>, ir<%tmp0>
 ; CHECK-NEXT:   CLONE ir<%tmp3> = load ir<%tmp2>
 ; CHECK-NEXT:   CLONE store ir<0>, ir<%tmp2>
-; CHECK-NEXT:   CLONE ir<%tmp4> = zext ir<%tmp3>
-; CHECK-NEXT:   CLONE ir<%tmp5> = trunc ir<%tmp4>
 ; CHECK-NEXT: Successor(s): if.then

 ; CHECK:      if.then:
@ -28,6 +26,8 @@ define void @sink_with_sideeffects(i1 %c, i8* %ptr) {
 ; CHECK-NEXT:   CondBit: ir<%c>

 ; CHECK:      pred.store.if:
+; CHECK-NEXT:   CLONE ir<%tmp4> = zext ir<%tmp3>
+; CHECK-NEXT:   CLONE ir<%tmp5> = trunc ir<%tmp4>
 ; CHECK-NEXT:   CLONE store ir<%tmp5>, ir<%tmp2>
 ; CHECK-NEXT:   Successor(s): pred.store.continue

--- a/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@ -14,7 +14,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK-NEXT: loop:
 ; CHECK-NEXT:   WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next
 ; CHECK-NEXT:   EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0>
-; CHECK-NEXT:   REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%indvars.iv>
 ; CHECK-NEXT: Successor(s): pred.load

 ; CHECK:       <xVFxUF> pred.load: {
@ -24,6 +23,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK-NEXT:   CondBit: vp<%2> (loop)

 ; CHECK:       pred.load.if:
+; CHECK-NEXT:     REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%indvars.iv>
 ; CHECK-NEXT:     REPLICATE ir<%lv.b> = load ir<%gep.b>
 ; CHECK-NEXT:   Successor(s): pred.load.continue

@ -33,9 +33,6 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK-NEXT: }

 ; CHECK:      loop.0:
-; CHECK-NEXT:   REPLICATE ir<%add> = add vp<%5>, ir<10>
-; CHECK-NEXT:   REPLICATE ir<%mul> = mul ir<2>, ir<%add>
-; CHECK-NEXT:   REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%indvars.iv>
 ; CHECK-NEXT: Successor(s): pred.store

 ; CHECK:      <xVFxUF> pred.store: {
@ -45,6 +42,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK-NEXT:   CondBit: vp<%2> (loop)

 ; CHECK:      pred.store.if:
+; CHECK-NEXT:     REPLICATE ir<%add> = add vp<%5>, ir<10>
+; CHECK-NEXT:     REPLICATE ir<%mul> = mul ir<2>, ir<%add>
+; CHECK-NEXT:     REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%indvars.iv>
 ; CHECK-NEXT:     REPLICATE store ir<%mul>, ir<%gep.a>
 ; CHECK-NEXT:   Successor(s): pred.store.continue

@ -85,7 +85,6 @@ exit:
 ; CHECK-NEXT: loop:
 ; CHECK-NEXT:   WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next
 ; CHECK-NEXT:   EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0>
-; CHECK-NEXT:   REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%indvars.iv>
 ; CHECK-NEXT: Successor(s): pred.load

 ; CHECK:      <xVFxUF> pred.load: {
@ -95,6 +94,7 @@ exit:
 ; CHECK-NEXT:   CondBit: vp<%2> (loop)

 ; CHECK:      pred.load.if:
+; CHECK-NEXT:     REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%indvars.iv>
 ; CHECK-NEXT:     REPLICATE ir<%lv.b> = load ir<%gep.b>
 ; CHECK-NEXT:   Successor(s): pred.load.continue

@ -104,9 +104,7 @@ exit:
 ; CHECK-NEXT: }

 ; CHECK:      loop.0:
-; CHECK-NEXT:   REPLICATE ir<%add> = add vp<%5>, ir<10>
 ; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%indvars.iv>, ir<2>
-; CHECK-NEXT:   REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul>
 ; CHECK-NEXT: Successor(s): pred.store

 ; CHECK:      <xVFxUF> pred.store: {
@ -116,6 +114,8 @@ exit:
 ; CHECK-NEXT:   CondBit: vp<%2> (loop)

 ; CHECK:       pred.store.if:
+; CHECK-NEXT:     REPLICATE ir<%add> = add vp<%5>, ir<10>
+; CHECK-NEXT:     REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul>
 ; CHECK-NEXT:     REPLICATE store ir<%add>, ir<%gep.a>
 ; CHECK-NEXT:   Successor(s): pred.store.continue

@ -156,7 +156,6 @@ exit:
 ; CHECK-NEXT: loop:
 ; CHECK-NEXT:   WIDEN-INDUCTION %indvars.iv = phi 0, %indvars.iv.next
 ; CHECK-NEXT:   EMIT vp<%2> = icmp ule ir<%indvars.iv> vp<%0>
-; CHECK-NEXT:   REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%indvars.iv>
 ; CHECK-NEXT: Successor(s): pred.load

 ; CHECK:      <xVFxUF> pred.load: {
@ -166,6 +165,7 @@ exit:
 ; CHECK-NEXT:   CondBit: vp<%2> (loop)

 ; CHECK:       pred.load.if:
+; CHECK-NEXT:     REPLICATE ir<%gep.b> = getelementptr ir<@b>, ir<0>, ir<%indvars.iv>
 ; CHECK-NEXT:     REPLICATE ir<%lv.b> = load ir<%gep.b> (S->V)
 ; CHECK-NEXT:   Successor(s): pred.load.continue

@ -177,7 +177,6 @@ exit:
 ; CHECK:      loop.0:
 ; CHECK-NEXT:   WIDEN ir<%add> = add vp<%5>, ir<10>
 ; CHECK-NEXT:   WIDEN ir<%mul> = mul ir<%indvars.iv>, ir<%add>
-; CHECK-NEXT:   REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul>
 ; CHECK-NEXT: Successor(s): pred.store

 ; CHECK:      <xVFxUF> pred.store: {
@ -187,6 +186,7 @@ exit:
 ; CHECK-NEXT:   CondBit: vp<%2> (loop)

 ; CHECK:      pred.store.if:
+; CHECK-NEXT:     REPLICATE ir<%gep.a> = getelementptr ir<@a>, ir<0>, ir<%mul>
 ; CHECK-NEXT:     REPLICATE store ir<%add>, ir<%gep.a>
 ; CHECK-NEXT:   Successor(s): pred.store.continue