[LV] Avoid emitting trivially dead instructions

Some instructions from the original loop, when vectorized, can become trivially dead. This happens because of the way we structure the new loop. For example, we create new induction variables and induction variable "steps" in the new loop. Thus, when we go to vectorize the original induction variable update, it may no longer be needed due to the instructions we've already created. This patch prevents us from creating these redundant instructions. This reduces code size before simplification and allows greater flexibility in code generation since we have fewer unnecessary instruction uses. Differential Revision: https://reviews.llvm.org/D25631 llvm-svn: 284631
2025-01-31 20:51:52 +01:00 · 2016-10-19 19:22:02 +00:00 · 2016-10-19 19:22:02 +00:00 · f64cb1c3a7
commit f64cb1c3a7
parent d8727d0208
2 changed files with 87 additions and 0 deletions
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -441,6 +441,10 @@ protected:
  /// respective conditions.
  void predicateInstructions();

+  /// Collect the instructions from the original loop that would be trivially
+  /// dead in the vectorized loop if generated.
+  void collectTriviallyDeadInstructions();
+
  /// Shrinks vector element sizes to the smallest bitwidth they can be legally
  /// represented as.
  void truncateToMinimalBitwidths();
@ -763,6 +767,14 @@ protected:

  // Record whether runtime checks are added.
  bool AddedSafetyChecks;
+
+  // Holds instructions from the original loop whose counterparts in the
+  // vectorized loop would be trivially dead if generated. For example,
+  // original induction update instructions can become dead because we
+  // separately emit induction "steps" when generating code for the new loop.
+  // Similarly, we create a new latch condition when setting up the structure
+  // of the new loop, so the old one can become dead.
+  SmallPtrSet<Instruction *, 4> DeadInstructions;
 };

 class InnerLoopUnroller : public InnerLoopVectorizer {
@ -3802,6 +3814,11 @@ void InnerLoopVectorizer::vectorizeLoop() {
  // are vectorized, so we can use them to construct the PHI.
  PhiVector PHIsToFix;

+  // Collect instructions from the original loop that will become trivially
+  // dead in the vectorized loop. We don't need to vectorize these
+  // instructions.
+  collectTriviallyDeadInstructions();
+
  // Scan the loop in a topological order to ensure that defs are vectorized
  // before users.
  LoopBlocksDFS DFS(OrigLoop);
@ -4209,6 +4226,29 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
  }
 }

+void InnerLoopVectorizer::collectTriviallyDeadInstructions() {
+  BasicBlock *Latch = OrigLoop->getLoopLatch();
+
+  // We create new control-flow for the vectorized loop, so the original
+  // condition will be dead after vectorization if it's only used by the
+  // branch.
+  auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+  if (Cmp && Cmp->hasOneUse())
+    DeadInstructions.insert(Cmp);
+
+  // We create new "steps" for induction variable updates to which the original
+  // induction variables map. An original update instruction will be dead if
+  // all its users except the induction variable are dead.
+  for (auto &Induction : *Legal->getInductionVars()) {
+    PHINode *Ind = Induction.first;
+    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+    if (all_of(IndUpdate->users(), [&](User *U) -> bool {
+          return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+        }))
+      DeadInstructions.insert(IndUpdate);
+  }
+}
+
 void InnerLoopVectorizer::predicateInstructions() {

  // For each instruction I marked for predication on value C, split I into its
@ -4536,6 +4576,11 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
  // For each instruction in the old loop.
  for (Instruction &I : *BB) {

+    // If the instruction will become trivially dead when vectorized, we don't
+    // need to generate it.
+    if (DeadInstructions.count(&I))
+      continue;
+
    // Scalarize instructions that should remain scalar after vectorization.
    if (!(isa<BranchInst>(&I) || isa<PHINode>(&I) ||
          isa<DbgInfoIntrinsic>(&I)) &&
--- a/test/Transforms/LoopVectorize/dead_instructions.ll
+++ b/test/Transforms/LoopVectorize/dead_instructions.ll
@ -0,0 +1,42 @@
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @dead_instructions_01
+;
+; This test ensures that we don't generate trivially dead instructions prior to
+; instruction simplification. We don't need to generate instructions
+; corresponding to the original induction variable update or branch condition,
+; since we rewrite the loop structure.
+;
+; CHECK:     vector.body:
+; CHECK:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:       %[[I0:.+]] = add i64 %index, 0
+; CHECK:       %[[I2:.+]] = add i64 %index, 2
+; CHECK:       getelementptr inbounds i64, i64* %a, i64 %[[I0]]
+; CHECK:       getelementptr inbounds i64, i64* %a, i64 %[[I2]]
+; CHECK-NOT:   add nuw nsw i64 %[[I0]], 1
+; CHECK-NOT:   add nuw nsw i64 %[[I2]], 1
+; CHECK-NOT:   icmp slt i64 {{.*}}, %n
+; CHECK:       %index.next = add i64 %index, 4
+; CHECK:       %[[CMP:.+]] = icmp eq i64 %index.next, %n.vec
+; CHECK:       br i1 %[[CMP]], label %middle.block, label %vector.body
+;
+define i64 @dead_instructions_01(i64 *%a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %r = phi i64 [ %tmp2, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
+  %tmp1 = load i64, i64* %tmp0, align 8
+  %tmp2 = add i64 %tmp1, %r
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  %tmp3  = phi i64 [ %tmp2, %for.body ]
+  ret i64 %tmp3
+}