[SVE] Add support for scalable vectors with vectorize.scalable.enable loop attribute

This patch adds support for a new loop hint, vectorize.scalable.enable, that says whether scalable vectorization should be enabled. A user who wants to instruct the compiler to vectorize a loop with scalable vectors can now do so as follows:

  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2
  ...
  !2 = !{!2, !3, !4}
  !3 = !{!"llvm.loop.vectorize.width", i32 8}
  !4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}

Setting the hint to false simply reverts the behaviour to the default, using fixed-width vectors.

Differential Revision: https://reviews.llvm.org/D88962
2024-11-22 10:42:39 +01:00 · 2020-10-07 09:21:39 +01:00 · 2020-10-07 09:21:39 +01:00 · 6d7c7dcc2b
commit 6d7c7dcc2b
parent ae44f6b6df
9 changed files with 210 additions and 24 deletions
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@ -5956,6 +5956,21 @@ vectorization:
   !0 = !{!"llvm.loop.vectorize.predicate.enable", i1 0}
   !1 = !{!"llvm.loop.vectorize.predicate.enable", i1 1}

+'``llvm.loop.vectorize.scalable.enable``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata selectively enables or disables scalable vectorization for the
+loop, and only has any effect if vectorization for the loop is already enabled.
+The first operand is the string ``llvm.loop.vectorize.scalable.enable``
+and the second operand is a bit. If the bit operand value is 1, scalable
+vectorization is enabled, whereas a value of 0 reverts to the default
+fixed-width vectorization:
+
+.. code-block:: llvm
+
+   !0 = !{!"llvm.loop.vectorize.scalable.enable", i1 0}
+   !1 = !{!"llvm.loop.vectorize.scalable.enable", i1 1}
+
 '``llvm.loop.vectorize.width``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

--- a/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/include/llvm/Transforms/Utils/LoopUtils.h
@ -213,6 +213,13 @@ Optional<const MDOperand *> findStringMetadataForLoop(const Loop *TheLoop,
 /// Find named metadata for a loop with an integer value.
 llvm::Optional<int> getOptionalIntLoopAttribute(Loop *TheLoop, StringRef Name);

+/// Construct an ElementCount for a loop from its "llvm.loop.vectorize.width"
+/// and "llvm.loop.vectorize.scalable.enable" metadata. The count is scalable
+/// only if the scalable hint is present and non-zero. Returns None if
+/// "llvm.loop.vectorize.width" cannot be found.
+Optional<ElementCount>
+getOptionalElementCountLoopAttribute(Loop *TheLoop);
+
 /// Create a new loop identifier for a loop created from a loop transformation.
 ///
 /// @param OrigLoopID The loop ID of the loop before the transformation.
--- a/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ b/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@ -29,6 +29,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Support/TypeSize.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"

 namespace llvm {
@ -43,8 +44,14 @@ namespace llvm {
 /// for example 'force', means a decision has been made. So, we need to be
 /// careful NOT to add them if the user hasn't specifically asked so.
 class LoopVectorizeHints {
-  enum HintKind { HK_WIDTH, HK_UNROLL, HK_FORCE, HK_ISVECTORIZED,
-                  HK_PREDICATE };
+  enum HintKind {
+    HK_WIDTH,
+    HK_UNROLL,
+    HK_FORCE,
+    HK_ISVECTORIZED,
+    HK_PREDICATE,
+    HK_SCALABLE
+  };

  /// Hint - associates name and validation with the hint value.
  struct Hint {
@ -73,6 +80,9 @@ class LoopVectorizeHints {
  /// Vector Predicate
  Hint Predicate;

+  /// Says whether we should use fixed width or scalable vectorization.
+  Hint Scalable;
+
  /// Return the loop metadata prefix.
  static StringRef Prefix() { return "llvm.loop."; }

@ -98,7 +108,9 @@ public:
  /// Dumps all the hint information.
  void emitRemarkWithHints() const;

-  unsigned getWidth() const { return Width.Value; }
+  ElementCount getWidth() const {
+    return ElementCount::get(Width.Value, isScalable());
+  }
  unsigned getInterleave() const { return Interleave.Value; }
  unsigned getIsVectorized() const { return IsVectorized.Value; }
  unsigned getPredicate() const { return Predicate.Value; }
@ -109,6 +121,8 @@ public:
    return (ForceKind)Force.Value;
  }

+  bool isScalable() const { return Scalable.Value; }
+
  /// If hints are provided that force vectorization, use the AlwaysPrint
  /// pass name to force the frontend to print the diagnostic.
  const char *vectorizeAnalysisPassName() const;
@ -119,7 +133,9 @@ public:
    // enabled by default because can be unsafe or inefficient. For example,
    // reordering floating-point operations will change the way round-off
    // error accumulates in the loop.
-    return getForce() == LoopVectorizeHints::FK_Enabled || getWidth() > 1;
+    ElementCount EC = getWidth();
+    return getForce() == LoopVectorizeHints::FK_Enabled ||
+           EC.getKnownMinValue() > 1;
  }

  bool isPotentiallyUnsafe() const {
--- a/lib/Transforms/Scalar/WarnMissedTransforms.cpp
+++ b/lib/Transforms/Scalar/WarnMissedTransforms.cpp
@ -48,12 +48,12 @@ static void warnAboutLeftoverTransformations(Loop *L,

  if (hasVectorizeTransformation(L) == TM_ForcedByUser) {
    LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n");
-    Optional<int> VectorizeWidth =
-        getOptionalIntLoopAttribute(L, "llvm.loop.vectorize.width");
+    Optional<ElementCount> VectorizeWidth =
+        getOptionalElementCountLoopAttribute(L);
    Optional<int> InterleaveCount =
        getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");

-    if (VectorizeWidth.getValueOr(0) != 1)
+    if (!VectorizeWidth || VectorizeWidth->isVector())
      ORE->emit(
          DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
                                            "FailedRequestedVectorization",
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@ -301,6 +301,21 @@ bool llvm::getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name) {
  return getOptionalBoolLoopAttribute(TheLoop, Name).getValueOr(false);
 }

+Optional<ElementCount>
+llvm::getOptionalElementCountLoopAttribute(Loop *TheLoop) {
+  // Without an explicit width hint there is no element count to report.
+  Optional<int> WidthHint =
+      getOptionalIntLoopAttribute(TheLoop, "llvm.loop.vectorize.width");
+  if (!WidthHint)
+    return None;
+
+  // The scalable hint is optional; a missing hint selects fixed width.
+  Optional<int> ScalableHint = getOptionalIntLoopAttribute(
+      TheLoop, "llvm.loop.vectorize.scalable.enable");
+  bool UseScalable = ScalableHint.getValueOr(0) != 0;
+  return ElementCount::get(*WidthHint, UseScalable);
+}
+
 llvm::Optional<int> llvm::getOptionalIntLoopAttribute(Loop *TheLoop,
                                                      StringRef Name) {
  const MDOperand *AttrMD =
@ -450,14 +465,15 @@ TransformationMode llvm::hasVectorizeTransformation(Loop *L) {
  if (Enable == false)
    return TM_SuppressedByUser;

-  Optional<int> VectorizeWidth =
-      getOptionalIntLoopAttribute(L, "llvm.loop.vectorize.width");
+  Optional<ElementCount> VectorizeWidth =
+      getOptionalElementCountLoopAttribute(L);
  Optional<int> InterleaveCount =
      getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");

  // 'Forcing' vector width and interleave count to one effectively disables
  // this tranformation.
-  if (Enable == true && VectorizeWidth == 1 && InterleaveCount == 1)
+  if (Enable == true && VectorizeWidth && VectorizeWidth->isScalar() &&
+      InterleaveCount == 1)
    return TM_SuppressedByUser;

  if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
@ -466,10 +482,10 @@ TransformationMode llvm::hasVectorizeTransformation(Loop *L) {
  if (Enable == true)
    return TM_ForcedByUser;

-  if (VectorizeWidth == 1 && InterleaveCount == 1)
+  if ((VectorizeWidth && VectorizeWidth->isScalar()) && InterleaveCount == 1)
    return TM_Disable;

-  if (VectorizeWidth > 1 || InterleaveCount > 1)
+  if ((VectorizeWidth && VectorizeWidth->isVector()) || InterleaveCount > 1)
    return TM_Enable;

  if (hasDisableAllTransformsHint(L))
--- a/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@ -66,6 +66,7 @@ bool LoopVectorizeHints::Hint::validate(unsigned Val) {
    return (Val <= 1);
  case HK_ISVECTORIZED:
  case HK_PREDICATE:
+  case HK_SCALABLE:
    return (Val == 0 || Val == 1);
  }
  return false;
@ -78,7 +79,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
      Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
      Force("vectorize.enable", FK_Undefined, HK_FORCE),
      IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
-      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE), TheLoop(L),
+      Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
+      Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L),
      ORE(ORE) {
  // Populate values with existing loop metadata.
  getHintsFromMetadata();
@ -91,7 +93,8 @@ LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
    // If the vectorization width and interleaving count are both 1 then
    // consider the loop to have been already vectorized because there's
    // nothing more that we can do.
-    IsVectorized.Value = Width.Value == 1 && Interleave.Value == 1;
+    IsVectorized.Value =
+        getWidth() == ElementCount::getFixed(1) && Interleave.Value == 1;
  LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs()
             << "LV: Interleaving disabled by the pass manager\n");
 }
@ -164,7 +167,7 @@ void LoopVectorizeHints::emitRemarkWithHints() const {
      if (Force.Value == LoopVectorizeHints::FK_Enabled) {
        R << " (Force=" << NV("Force", true);
        if (Width.Value != 0)
-          R << ", Vector Width=" << NV("VectorWidth", Width.Value);
+          R << ", Vector Width=" << NV("VectorWidth", getWidth());
        if (Interleave.Value != 0)
          R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
        R << ")";
@ -175,11 +178,11 @@ void LoopVectorizeHints::emitRemarkWithHints() const {
 }

 const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
-  if (getWidth() == 1)
+  if (getWidth() == ElementCount::getFixed(1))
    return LV_NAME;
  if (getForce() == LoopVectorizeHints::FK_Disabled)
    return LV_NAME;
-  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth() == 0)
+  if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
    return LV_NAME;
  return OptimizationRemarkAnalysis::AlwaysPrint;
 }
@ -230,7 +233,8 @@ void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
    return;
  unsigned Val = C->getZExtValue();

-  Hint *Hints[] = {&Width, &Interleave, &Force, &IsVectorized, &Predicate};
+  Hint *Hints[] = {&Width,        &Interleave, &Force,
+                   &IsVectorized, &Predicate,  &Scalable};
  for (auto H : Hints) {
    if (Name == H->Name) {
      if (H->validate(Val))
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -8314,11 +8314,16 @@ static bool processLoopInVPlanNativePath(
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);

  // Get user vectorization factor.
-  const unsigned UserVF = Hints.getWidth();
+  ElementCount UserVF = Hints.getWidth();
+  if (UserVF.isScalable()) {
+    // TODO: Use scalable UserVF once we've added initial support for scalable
+    // vectorization. For now we convert it to fixed width, but this will be
+    // removed in a later patch.
+    UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
+  }

  // Plan how to best vectorize, return the best VF and its cost.
-  const VectorizationFactor VF =
-      LVP.planInVPlanNativePath(ElementCount::getFixed(UserVF));
+  const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);

  // If we are stress testing VPlan builds, do not attempt to generate vector
  // code. Masked vector code generation support will follow soon.
@ -8480,12 +8485,18 @@ bool LoopVectorizePass::processLoop(Loop *L) {
  LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);

  // Get user vectorization factor and interleave count.
-  unsigned UserVF = Hints.getWidth();
+  ElementCount UserVF = Hints.getWidth();
+  if (UserVF.isScalable()) {
+    // TODO: Use scalable UserVF once we've added initial support for scalable
+    // vectorization. For now we convert it to fixed width, but this will be
+    // removed in a later patch.
+    UserVF = ElementCount::getFixed(UserVF.getKnownMinValue());
+  }
+
  unsigned UserIC = Hints.getInterleave();

  // Plan how to best vectorize, return the best VF and its cost.
-  Optional<VectorizationFactor> MaybeVF =
-      LVP.plan(ElementCount::getFixed(UserVF), UserIC);
+  Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);

  VectorizationFactor VF = VectorizationFactor::Disabled();
  unsigned IC = 1;
--- a/test/Transforms/LoopVectorize/metadata-width.ll
+++ b/test/Transforms/LoopVectorize/metadata-width.ll
@ -24,7 +24,55 @@ for.end:                                          ; preds = %for.body, %entry
  ret void
 }

+; CHECK-LABEL: @test2(
+; CHECK: store <8 x i32>
+; CHECK: ret void
+define void @test2(i32* nocapture %a, i32 %n) #0 {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; CHECK-LABEL: @test3(
+; CHECK: store <8 x i32>
+; CHECK: ret void
+define void @test3(i32* nocapture %a, i32 %n) #0 {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = trunc i64 %indvars.iv to i32
+  store i32 %0, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !4
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
 attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "frame-pointer"="none" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }

 !0 = !{!0, !1}
 !1 = !{!"llvm.loop.vectorize.width", i32 8}
+!2 = !{!2, !1, !3}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i32 1}
+!4 = !{!4, !1, !5}
+!5 = !{!"llvm.loop.vectorize.scalable.enable", i32 0}
--- a/test/Transforms/LoopVectorize/no_array_bounds_scalable.ll
+++ b/test/Transforms/LoopVectorize/no_array_bounds_scalable.ll
@ -0,0 +1,69 @@
+; RUN: opt < %s -loop-vectorize -transform-warning -S 2>&1 | FileCheck %s
+
+; Like no_array_bounds.ll, we verify that warnings are generated when vectorization/interleaving
+; is explicitly requested but fails to occur, for both fixed and scalable vectorize.width loop hints.
+
+;  #pragma clang loop vectorize(enable)
+;  for (int i = 0; i < number; i++) {
+;    A[B[i]]++;
+;  }
+
+; CHECK: warning: <unknown>:0:0: loop not interleaved: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+define dso_local void @foo(i32* nocapture %A, i32* nocapture readonly %B, i32 %N) {
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %idxprom1 = sext i32 %0 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+; CHECK: warning: <unknown>:0:0: loop not vectorized: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering
+define dso_local void @foo2(i32* nocapture %A, i32* nocapture readonly %B, i32 %N) {
+entry:
+  %cmp7 = icmp sgt i32 %N, 0
+  br i1 %cmp7, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %idxprom1 = sext i32 %0 to i64
+  %arrayidx2 = getelementptr inbounds i32, i32* %A, i64 %idxprom1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %inc = add nsw i32 %1, 1
+  store i32 %inc, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !3
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.vectorize.enable", i1 true}
+!2 = !{!"llvm.loop.vectorize.width", i32 1}
+!3 = distinct !{!3, !1, !2, !4}
+!4 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}