[LV] Clamp VF hint when unsafe

In the following loop the dependence distance is 2, so the loop can only be vectorized if the vector length is no larger than this. void foo(int *a, int *b, int N) { #pragma clang loop vectorize(enable) vectorize_width(4) for (int i=0; i<N; ++i) { a[i + 2] = a[i] + b[i]; } } However, when specifying a VF of 4 via a loop hint this loop is vectorized. According to [1][2], loop hints are ignored if the optimization is not safe to apply. This patch introduces a check that clamps the user-specified VF to the maximum safe VF when it is greater than the maximum feasible VF, unless explicitly forced with '-force-vector-width=X'. [1] https://llvm.org/docs/LangRef.html#llvm-loop-vectorize-and-llvm-loop-interleave [2] https://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations Reviewed By: sdesmalen, fhahn, Meinersbur Differential Revision: https://reviews.llvm.org/D90687
2024-11-23 03:02:36 +01:00 · 2020-11-02 13:02:32 +00:00 · 2020-11-02 13:02:32 +00:00 · c640adbe73
commit c640adbe73
parent 32a49915a1
3 changed files with 123 additions and 9 deletions
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -1443,7 +1443,8 @@ private:
  /// \return An upper bound for the vectorization factor, a power-of-2 larger
  /// than zero. One is returned if vectorization should best be avoided due
  /// to cost.
-  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount);
+  ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
+                                    ElementCount UserVF);

  /// The vectorization cost is a combination of the cost itself and a boolean
  /// indicating whether any of the contributing operations will actually
@ -5270,9 +5271,11 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
    return None;
  }

+  ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
+
  switch (ScalarEpilogueStatus) {
  case CM_ScalarEpilogueAllowed:
-    return UserVF ? UserVF : computeFeasibleMaxVF(TC);
+    return MaxVF;
  case CM_ScalarEpilogueNotNeededUsePredicate:
    LLVM_DEBUG(
        dbgs() << "LV: vector predicate hint/switch found.\n"
@ -5308,7 +5311,6 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
    InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
  }

-  ElementCount MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
  assert(!MaxVF.isScalable() &&
         "Scalable vectors do not yet support tail folding");
  assert((UserVF.isNonZero() || isPowerOf2_32(MaxVF.getFixedValue())) &&
@ -5361,7 +5363,9 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
 }

 ElementCount
-LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
+LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
+                                                 ElementCount UserVF) {
+  assert(!UserVF.isScalable() && "scalable vectorization not yet handled");
  MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
  unsigned SmallestType, WidestType;
  std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@ -5373,6 +5377,27 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
  // dependence distance).
  unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();

+  if (UserVF.isNonZero()) {
+    // If legally unsafe, clamp the user vectorization factor to a safe value.
+    unsigned MaxSafeVF = PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
+    if (UserVF.getFixedValue() <= MaxSafeVF)
+      return UserVF;
+
+    LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
+                      << " is unsafe, clamping to max safe VF=" << MaxSafeVF
+                      << ".\n");
+    ORE->emit([&]() {
+      return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
+                                        TheLoop->getStartLoc(),
+                                        TheLoop->getHeader())
+             << "User-specified vectorization factor "
+             << ore::NV("UserVectorizationFactor", UserVF)
+             << " is unsafe, clamping to maximum safe vectorization factor "
+             << ore::NV("VectorizationFactor", MaxSafeVF);
+    });
+    return ElementCount::getFixed(MaxSafeVF);
+  }
+
  WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);

  // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
@ -7031,9 +7056,12 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
      CM.invalidateCostModelingDecisions();
  }

-  if (!UserVF.isZero()) {
+  ElementCount MaxVF = MaybeMaxVF.getValue();
+  assert(MaxVF.isNonZero() && "MaxVF is zero.");
+
+  if (!UserVF.isZero() && UserVF.getFixedValue() <= MaxVF.getFixedValue()) {
    LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-    assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
+    assert(isPowerOf2_32(UserVF.getFixedValue()) &&
           "VF needs to be a power of two");
    // Collect the instructions (and their associated costs) that will be more
    // profitable to scalarize.
@ -7044,9 +7072,6 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
    return {{UserVF, 0}};
  }

-  ElementCount MaxVF = MaybeMaxVF.getValue();
-  assert(MaxVF.isNonZero() && "MaxVF is zero.");
-
  for (ElementCount VF = ElementCount::getFixed(1);
       ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
--- a/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
+++ b/test/Transforms/LoopVectorize/AArch64/unsafe-vf-hint-remark.ll
@ -0,0 +1,43 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -mtriple=arm64-apple-iphoneos -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Specify a large unsafe vectorization factor of 32 that gets clamped to 16,
+; then test that an even smaller VF of 2 is selected by the cost model.
+
+; CHECK: LV: User VF=32 is unsafe, clamping to max safe VF=16.
+; CHECK: remark: <unknown>:0:0: User-specified vectorization factor 32 is unsafe, clamping to maximum safe vectorization factor 16
+; CHECK: LV: Selecting VF: 2.
+; CHECK-LABEL: @test
+; CHECK: <2 x i64>
+define void @test(i64* nocapture %a, i64* nocapture readonly %b) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %latch ]
+  %arrayidx = getelementptr inbounds i64, i64* %a, i64 %iv
+  %0 = load i64, i64* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i64, i64* %b, i64 %iv
+  %1 = load i64, i64* %arrayidx2, align 4
+  %add = add nsw i64 %1, %0
+  %2 = add nuw nsw i64 %iv, 16
+  %arrayidx5 = getelementptr inbounds i64, i64* %a, i64 %2
+  %c = icmp eq i64 %1, 120
+  br i1 %c, label %then, label %latch
+
+then:
+  store i64 %add, i64* %arrayidx5, align 4
+  br label %latch
+
+latch:
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop.header, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i64 32}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
--- a/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll
+++ b/test/Transforms/LoopVectorize/unsafe-vf-hint-remark.ll
@ -0,0 +1,46 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -pass-remarks-analysis=loop-vectorize -debug-only=loop-vectorize -S < %s 2>&1 | FileCheck %s
+
+; Make sure the unsafe user-specified vectorization factor is clamped.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; void foo(int *a, int *b) {
+;   #pragma clang loop vectorize(enable) vectorize_width(4)
+;   for (int i=0; i < 1024; ++i) {
+;     a[i + 2] = a[i] + b[i];
+;   }
+; }
+
+; CHECK: LV: User VF=4 is unsafe, clamping to max safe VF=2.
+; CHECK: remark: <unknown>:0:0: User-specified vectorization factor 4 is unsafe, clamping to maximum safe vectorization factor 2
+; CHECK-LABEL: @foo
+; CHECK: <2 x i32>
+define void @foo(i32* %a, i32* %b) {
+entry:
+  br label %loop.ph
+
+loop.ph:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %loop.ph ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 2
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop, !llvm.loop !0
+
+exit:
+  ret void
+}
+
+!0 = !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.width", i32 4}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}