[LoopPredication] Enable predication when latchCheckIV is wider than rangeCheck

Summary: This patch allows us to predicate range checks that have a type narrower than the latch check type. We leverage SCEV analysis to identify a truncate for the latchLimit and latchStart. There is also safety checks in place which requires the start and limit to be known at compile time. We require this to make sure that the SCEV truncate expr for the IV corresponding to the latch does not cause us to lose information about the IV range. Added tests show the loop predication over range checks that are of various types and are narrower than the latch type. This enhancement has been in our downstream tree for a while. Reviewers: apilipenko, sanjoy, mkazantsev Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D39500 llvm-svn: 317269
2024-10-20 19:42:54 +02:00 · 2017-11-02 21:21:02 +00:00 · 2017-11-02 21:21:02 +00:00 · 93dc82e6f3
commit 93dc82e6f3
parent 2e3db2532b
2 changed files with 234 additions and 10 deletions
--- a/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@ -174,6 +174,9 @@

 using namespace llvm;

+static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
+                                        cl::Hidden, cl::init(true));
+
 namespace {
 class LoopPredication {
  /// Represents an induction variable check:
@ -212,6 +215,22 @@ class LoopPredication {
                                        IRBuilder<> &Builder);
  bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);

+  // When the IV type is wider than the range operand type, we can still do loop
+  // predication, by generating SCEVs for the range and latch that are of the
+  // same type. We achieve this by generating a SCEV truncate expression for the
+  // latch IV. This is done iff truncation of the IV is a safe operation,
+  // without loss of information.
+  // Another way to achieve this is by generating a wider type SCEV for the
+  // range check operand, however, this needs a more involved check that
+  // operands do not overflow. This can lead to loss of information when the
+  // range operand is of the form: add i32 %offset, %iv. We need to prove that
+  // sext(x + y) is same as sext(x) + sext(y).
+  // This function returns true if we can safely represent the IV type in
+  // the RangeCheckType without loss of information.
+  bool isSafeToTruncateWideIVType(Type *RangeCheckType);
+  // Return the loopLatchCheck corresponding to the RangeCheckType if safe to do
+  // so.
+  Optional<LoopICmp> generateLoopLatchCheck(Type *RangeCheckType);
 public:
  LoopPredication(ScalarEvolution *SE) : SE(SE){};
  bool runOnLoop(Loop *L);
@ -301,6 +320,34 @@ Value *LoopPredication::expandCheck(SCEVExpander &Expander,
  return Builder.CreateICmp(Pred, LHSV, RHSV);
 }

+Optional<LoopPredication::LoopICmp>
+LoopPredication::generateLoopLatchCheck(Type *RangeCheckType) {
+
+  auto *LatchType = LatchCheck.IV->getType();
+  if (RangeCheckType == LatchType)
+    return LatchCheck;
+  // For now, bail out if latch type is narrower than range type.
+  if (DL->getTypeSizeInBits(LatchType) < DL->getTypeSizeInBits(RangeCheckType))
+    return None;
+  if (!isSafeToTruncateWideIVType(RangeCheckType))
+    return None;
+  // We can now safely identify the truncated version of the IV and limit for
+  // RangeCheckType.
+  LoopICmp NewLatchCheck;
+  NewLatchCheck.Pred = LatchCheck.Pred;
+  NewLatchCheck.IV = dyn_cast<SCEVAddRecExpr>(
+      SE->getTruncateExpr(LatchCheck.IV, RangeCheckType));
+  if (!NewLatchCheck.IV)
+    return None;
+  NewLatchCheck.Limit = SE->getTruncateExpr(LatchCheck.Limit, RangeCheckType);
+  DEBUG(dbgs() << "IV of type: " << *LatchType
+               << "can be represented as range check type:" << *RangeCheckType
+               << "\n");
+  DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
+  DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
+  return NewLatchCheck;
+}
+
 /// If ICI can be widened to a loop invariant condition emits the loop
 /// invariant condition in the loop preheader and return it, otherwise
 /// returns None.
@ -325,22 +372,31 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
    return None;
  }
  auto *RangeCheckIV = RangeCheck->IV;
-  auto *Ty = RangeCheckIV->getType();
-  if (Ty != LatchCheck.IV->getType()) {
-    DEBUG(dbgs() << "Type mismatch between range check and latch IVs!\n");
-    return None;
-  }
  if (!RangeCheckIV->isAffine()) {
    DEBUG(dbgs() << "Range check IV is not affine!\n");
    return None;
  }
  auto *Step = RangeCheckIV->getStepRecurrence(*SE);
-  if (Step != LatchCheck.IV->getStepRecurrence(*SE)) {
+  // We cannot just compare with latch IV step because the latch and range IVs
+  // may have different types.
+  if (!Step->isOne()) {
    DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
    return None;
  }
-  assert(Step->isOne() && "must be one");
+  auto *Ty = RangeCheckIV->getType();
+  auto CurrLatchCheckOpt = generateLoopLatchCheck(Ty);
+  if (!CurrLatchCheckOpt) {
+    DEBUG(dbgs() << "Failed to generate a loop latch check "
+                    "corresponding to range type: "
+                 << *Ty << "\n");
+    return None;
+  }

+  LoopICmp CurrLatchCheck = *CurrLatchCheckOpt;
+  // At this point the range check step and latch step should have the same
+  // value and type.
+  assert(Step == CurrLatchCheck.IV->getStepRecurrence(*SE) &&
+         "Range and latch should have same step recurrence!");
  // Generate the widened condition:
  //   guardStart u< guardLimit &&
  //   latchLimit <pred> guardLimit - 1 - guardStart + latchStart
@ -348,8 +404,8 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
  // header comment for the reasoning.
  const SCEV *GuardStart = RangeCheckIV->getStart();
  const SCEV *GuardLimit = RangeCheck->Limit;
-  const SCEV *LatchStart = LatchCheck.IV->getStart();
-  const SCEV *LatchLimit = LatchCheck.Limit;
+  const SCEV *LatchStart = CurrLatchCheck.IV->getStart();
+  const SCEV *LatchLimit = CurrLatchCheck.Limit;

  // guardLimit - guardStart + latchStart - 1
  const SCEV *RHS =
@ -357,7 +413,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
                     SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));

  ICmpInst::Predicate LimitCheckPred;
-  switch (LatchCheck.Pred) {
+  switch (CurrLatchCheck.Pred) {
  case ICmpInst::ICMP_ULT:
    LimitCheckPred = ICmpInst::ICMP_ULE;
    break;
@ -510,6 +566,36 @@ Optional<LoopPredication::LoopICmp> LoopPredication::parseLoopLatchICmp() {
  return Result;
 }

+// Returns true if its safe to truncate the IV to RangeCheckType.
+bool LoopPredication::isSafeToTruncateWideIVType(Type *RangeCheckType) {
+  if (!EnableIVTruncation)
+    return false;
+  assert(DL->getTypeSizeInBits(LatchCheck.IV->getType()) >
+             DL->getTypeSizeInBits(RangeCheckType) &&
+         "Expected latch check IV type to be larger than range check operand "
+         "type!");
+  // The start and end values of the IV should be known. This is to guarantee
+  // that truncating the wide type will not lose information.
+  auto *Limit = dyn_cast<SCEVConstant>(LatchCheck.Limit);
+  auto *Start = dyn_cast<SCEVConstant>(LatchCheck.IV->getStart());
+  if (!Limit || !Start)
+    return false;
+  // This check makes sure that the IV does not change sign during loop
+  // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE,
+  // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the
+  // IV wraps around, and the truncation of the IV would lose the range of
+  // iterations between 2^32 and 2^64.
+  bool Increasing;
+  if (!SE->isMonotonicPredicate(LatchCheck.IV, LatchCheck.Pred, Increasing))
+    return false;
+  // The active bits should be less than the bits in the RangeCheckType. This
+  // guarantees that truncating the latch check to RangeCheckType is a safe
+  // operation.
+  auto RangeCheckTypeBitSize = DL->getTypeSizeInBits(RangeCheckType);
+  return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize &&
+         Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
+}
+
 bool LoopPredication::runOnLoop(Loop *Loop) {
  L = Loop;

--- a/test/Transforms/LoopPredication/widened.ll
+++ b/test/Transforms/LoopPredication/widened.ll
@ -0,0 +1,138 @@
+; RUN: opt -S -loop-predication -loop-predication-enable-iv-truncation=true < %s 2>&1 | FileCheck %s
+declare void @llvm.experimental.guard(i1, ...)
+
+declare i32 @length(i8*)
+
+declare i16 @short_length(i8*)
+; Consider range check of type i16 and i32, while IV is of type i64
+; We can loop predicate this because the IV range is within i16 and within i32.
+define i64 @iv_wider_type_rc_two_narrow_types(i32 %offA, i16 %offB, i8* %arrA, i8* %arrB) {
+; CHECK-LABEL: iv_wider_type_rc_two_narrow_types
+entry:
+; CHECK-LABEL: entry:
+; CHECK: [[idxB:[^ ]+]] = sub i16 %lengthB, %offB
+; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i16 16, [[idxB]]
+; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i16 %offB, %lengthB
+; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]]
+; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 %lengthA, %offA
+; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 16, [[idxA]]
+; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA
+; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]]
+  %lengthA = call i32 @length(i8* %arrA)
+  %lengthB = call i16 @short_length(i8* %arrB)
+   br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK: [[invariant_check:[^ ]+]] = and i1 [[WideChkB]], [[WideChkA]]
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 [[invariant_check]], i32 9)
+  %iv = phi i64 [0, %entry ], [ %iv.next, %loop ]
+  %iv.trunc.32 = trunc i64 %iv to i32
+  %iv.trunc.16 = trunc i64 %iv to i16
+  %indexA = add i32 %iv.trunc.32, %offA
+  %indexB = add i16 %iv.trunc.16, %offB
+  %rcA = icmp ult i32 %indexA, %lengthA
+  %rcB = icmp ult i16 %indexB, %lengthB
+  %wide.chk = and i1 %rcA, %rcB
+  call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk, i32 9) [ "deopt"() ]
+  %indexA.ext = zext i32 %indexA to i64
+  %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext
+  %eltA = load i8, i8* %addrA
+  %indexB.ext = zext i16 %indexB to i64
+  %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext
+  store i8 %eltA, i8* %addrB
+  %iv.next = add nuw nsw i64 %iv, 1
+  %latch.check = icmp ult i64 %iv.next, 16
+  br i1 %latch.check, label %loop, label %exit
+
+exit:
+ ret i64 %iv
+}
+
+
+; Consider an IV of type long and an array access into int array.
+; IV is of type i64 while the range check operands are of type i32 and i64.
+define i64 @iv_rc_different_types(i32 %offA, i32 %offB, i8* %arrA, i8* %arrB, i64 %max)
+{
+; CHECK-LABEL: iv_rc_different_types
+entry:
+; CHECK-LABEL: entry:
+; CHECK: [[lenB:[^ ]+]] = add i32 %lengthB, -1
+; CHECK-NEXT: [[idxB:[^ ]+]] = sub i32 [[lenB]], %offB
+; CHECK-NEXT: [[limit_checkB:[^ ]+]] = icmp ule i32 15, [[idxB]]
+; CHECK-NEXT: [[first_iteration_checkB:[^ ]+]] = icmp ult i32 %offB, %lengthB
+; CHECK-NEXT: [[WideChkB:[^ ]+]] = and i1 [[first_iteration_checkB]], [[limit_checkB]]
+; CHECK-NEXT: [[maxMinusOne:[^ ]+]] = add i64 %max, -1
+; CHECK-NEXT: [[limit_checkMax:[^ ]+]] = icmp ule i64 15, [[maxMinusOne]]
+; CHECK-NEXT: [[first_iteration_checkMax:[^ ]+]] = icmp ult  i64 0, %max
+; CHECK-NEXT: [[WideChkMax:[^ ]+]] = and i1 [[first_iteration_checkMax]], [[limit_checkMax]]
+; CHECK-NEXT: [[lenA:[^ ]+]] = add i32 %lengthA, -1
+; CHECK-NEXT: [[idxA:[^ ]+]] = sub i32 [[lenA]], %offA
+; CHECK-NEXT: [[limit_checkA:[^ ]+]] = icmp ule i32 15, [[idxA]]
+; CHECK-NEXT: [[first_iteration_checkA:[^ ]+]] = icmp ult i32 %offA, %lengthA
+; CHECK-NEXT: [[WideChkA:[^ ]+]] = and i1 [[first_iteration_checkA]], [[limit_checkA]]
+  %lengthA = call i32 @length(i8* %arrA)
+  %lengthB = call i32 @length(i8* %arrB)
+  br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK: [[BandMax:[^ ]+]] = and i1 [[WideChkB]], [[WideChkMax]]
+; CHECK: [[ABandMax:[^ ]+]] = and i1 [[BandMax]], [[WideChkA]]
+; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[ABandMax]], i32 9)
+  %iv = phi i64 [0, %entry ], [ %iv.next, %loop ]
+  %iv.trunc = trunc i64 %iv to i32
+  %indexA = add i32 %iv.trunc, %offA
+  %indexB = add i32 %iv.trunc, %offB
+  %rcA = icmp ult i32 %indexA, %lengthA
+  %rcIV = icmp ult i64 %iv, %max
+  %wide.chk = and i1 %rcA, %rcIV
+  %rcB = icmp ult i32 %indexB, %lengthB
+  %wide.chk.final = and i1 %wide.chk, %rcB
+  call void (i1, ...) @llvm.experimental.guard(i1 %wide.chk.final, i32 9) [ "deopt"() ]
+  %indexA.ext = zext i32 %indexA to i64
+  %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext
+  %eltA = load i8, i8* %addrA
+  %indexB.ext = zext i32 %indexB to i64
+  %addrB = getelementptr inbounds i8, i8* %arrB, i64 %indexB.ext
+  %eltB = load i8, i8* %addrB
+  %result = xor i8 %eltA, %eltB
+  store i8 %result, i8* %addrA
+  %iv.next = add nuw nsw i64 %iv, 1
+  %latch.check = icmp ult i64 %iv, 15
+  br i1 %latch.check, label %loop, label %exit
+
+exit:
+  ret i64 %iv
+}
+
+; cannot narrow the IV to the range type, because we lose information.
+; for (i64 i= 5; i>= 2; i++)
+; this loop wraps around after reaching 2^64.
+define i64 @iv_rc_different_type(i32 %offA, i8* %arrA) {
+; CHECK-LABEL: iv_rc_different_type
+entry:
+  %lengthA = call i32 @length(i8* %arrA)
+  br label %loop
+
+loop:
+; CHECK-LABEL: loop:
+; CHECK: %rcA = icmp ult i32 %indexA, %lengthA
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %rcA, i32 9)
+  %iv = phi i64 [ 5, %entry ], [ %iv.next, %loop ]
+  %iv.trunc.32 = trunc i64 %iv to i32
+  %indexA = add i32 %iv.trunc.32, %offA
+  %rcA = icmp ult i32 %indexA, %lengthA
+  call void (i1, ...) @llvm.experimental.guard(i1 %rcA, i32 9) [ "deopt"() ]
+  %indexA.ext = zext i32 %indexA to i64
+  %addrA = getelementptr inbounds i8, i8* %arrA, i64 %indexA.ext
+  %eltA = load i8, i8* %addrA
+  %res = add i8 %eltA, 2
+  store i8 %eltA, i8* %addrA
+  %iv.next = add i64 %iv, 1
+  %latch.check = icmp sge i64 %iv.next, 2
+  br i1 %latch.check, label %loop, label %exit
+
+exit:
+ ret i64 %iv
+}