[LV] Vectorize without versioning-for-unit-stride under -Os/-Oz

If a loop is in a function marked OptSize, Loop Access Analysis should refrain from generating runtime checks for unit strides that will version the loop. If a loop is in a function marked OptSize and its vectorization is enabled, it should be vectorized w/o any versioning. Fixes PR46228. Differential Revision: https://reviews.llvm.org/D81345
2025-01-31 12:41:49 +01:00 · 2020-06-07 11:36:57 +03:00 · 2020-06-07 11:36:57 +03:00 · dfe7df7bd4
commit dfe7df7bd4
parent d46bc6cdf6
7 changed files with 202 additions and 180 deletions
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@ -1835,6 +1835,10 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,

  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();

+  const bool EnableMemAccessVersioningOfLoop =
+      EnableMemAccessVersioning &&
+      !TheLoop->getHeader()->getParent()->hasOptSize();
+
  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    // Scan the BB and collect legal loads and stores. Also detect any
@ -1890,7 +1894,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
        NumLoads++;
        Loads.push_back(Ld);
        DepChecker->addAccess(Ld);
-        if (EnableMemAccessVersioning)
+        if (EnableMemAccessVersioningOfLoop)
          collectStridedAccess(Ld);
        continue;
      }
@ -1914,7 +1918,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
        NumStores++;
        Stores.push_back(St);
        DepChecker->addAccess(St);
-        if (EnableMemAccessVersioning)
+        if (EnableMemAccessVersioningOfLoop)
          collectStridedAccess(St);
      }
    } // Next instr.
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -4937,15 +4937,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
    return true;
  }

-  // FIXME: Avoid specializing for stride==1 instead of bailing out.
-  if (!Legal->getLAI()->getSymbolicStrides().empty()) {
-    reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
-        "runtime stride == 1 checks needed. Enable vectorization of "
-        "this loop with '#pragma clang loop vectorize(enable)' when "
-        "compiling with -Os/-Oz",
-        "CantVersionLoopWithOptForSize", ORE, TheLoop);
-    return true;
-  }
+  assert(Legal->getLAI()->getSymbolicStrides().empty() &&
+         "Specializing for stride == 1 under -Os/-Oz");

  return false;
 }
@ -7611,7 +7604,7 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
                                                     PGSOQueryType::IRPass);
  // 1) OptSize takes precedence over all other options, i.e. if this is set,
  // don't look at hints or options, and don't request a scalar epilogue.
-  if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
+  if (OptSize)
    return CM_ScalarEpilogueNotAllowedOptSize;

  bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
--- a/test/Transforms/LoopVectorize/X86/optsize.ll
+++ b/test/Transforms/LoopVectorize/X86/optsize.ll
@ -218,38 +218,35 @@ for.end:                                          ; preds = %for.body
 attributes #1 = { minsize }


-; We can't vectorize this one because we version for stride==1; even having TC
-; a multiple of VF.
+; We can vectorize this one by refraining from versioning for stride==1.
 define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #2 {
 ; CHECK-LABEL: @scev4stride1(
 ; CHECK-NEXT:  for.body.preheader:
-; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
-; CHECK:       for.body:
-; CHECK-NEXT:    [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
-; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K:%.*]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[MUL]]
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_07]]
-; CHECK-NEXT:    store i32 [[TMP0]], i32* [[ARRAYIDX1]], align 4
-; CHECK-NEXT:    [[INC]] = add nuw nsw i32 [[I_07]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <64 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <64 x i32> [[BROADCAST_SPLATINSERT]], <64 x i32> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 256, 256
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    ret void
 ;
 ; AUTOVF-LABEL: @scev4stride1(
 ; AUTOVF-NEXT:  for.body.preheader:
-; AUTOVF-NEXT:    br label [[FOR_BODY:%.*]]
-; AUTOVF:       for.body:
-; AUTOVF-NEXT:    [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER:%.*]] ]
-; AUTOVF-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I_07]], [[K:%.*]]
-; AUTOVF-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[MUL]]
-; AUTOVF-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; AUTOVF-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[I_07]]
-; AUTOVF-NEXT:    store i32 [[TMP0]], i32* [[ARRAYIDX1]], align 4
-; AUTOVF-NEXT:    [[INC]] = add nuw nsw i32 [[I_07]], 1
-; AUTOVF-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 256
-; AUTOVF-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
+; AUTOVF-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; AUTOVF:       vector.ph:
+; AUTOVF-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 [[K:%.*]], i32 0
+; AUTOVF-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; AUTOVF-NEXT:    br label [[VECTOR_BODY:%.*]]
+; AUTOVF:       vector.body:
+; AUTOVF:       middle.block:
+; AUTOVF-NEXT:    [[CMP_N:%.*]] = icmp eq i32 256, 256
+; AUTOVF-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; AUTOVF:       scalar.ph:
 ; AUTOVF:       for.end.loopexit:
 ; AUTOVF-NEXT:    ret void
 ;
--- a/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
+++ b/test/Transforms/LoopVectorize/X86/tail_loop_folding.ll
@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -loop-vectorize -S | FileCheck %s --check-prefixes=CHECK,DEFAULT
-; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s --check-prefixes=CHECK,PREDFLAG
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@ -74,141 +74,56 @@ for.body:
  br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !6
 }

+; Marking function as optsize turns tail folding on, as if explicit tail folding
+; flag was enabled.
 define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
-; DEFAULT-LABEL: @tail_folding_disabled(
-; DEFAULT-NEXT:  entry:
-; DEFAULT-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; DEFAULT:       vector.ph:
-; DEFAULT-NEXT:    br label [[VECTOR_BODY:%.*]]
-; DEFAULT:       vector.body:
-; DEFAULT-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; DEFAULT-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; DEFAULT-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; DEFAULT-NEXT:    [[TMP2:%.*]] = add i64 [[INDEX]], 16
-; DEFAULT-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 24
-; DEFAULT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; DEFAULT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP1]]
-; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP2]]
-; DEFAULT-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[TMP3]]
-; DEFAULT-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
-; DEFAULT-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4
-; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 8
-; DEFAULT-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP11]], align 4
-; DEFAULT-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 16
-; DEFAULT-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x i32>, <8 x i32>* [[TMP13]], align 4
-; DEFAULT-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 24
-; DEFAULT-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD3:%.*]] = load <8 x i32>, <8 x i32>* [[TMP15]], align 4
-; DEFAULT-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; DEFAULT-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP1]]
-; DEFAULT-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP2]]
-; DEFAULT-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[TMP3]]
-; DEFAULT-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 0
-; DEFAULT-NEXT:    [[TMP21:%.*]] = bitcast i32* [[TMP20]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD4:%.*]] = load <8 x i32>, <8 x i32>* [[TMP21]], align 4
-; DEFAULT-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 8
-; DEFAULT-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD5:%.*]] = load <8 x i32>, <8 x i32>* [[TMP23]], align 4
-; DEFAULT-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 16
-; DEFAULT-NEXT:    [[TMP25:%.*]] = bitcast i32* [[TMP24]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD6:%.*]] = load <8 x i32>, <8 x i32>* [[TMP25]], align 4
-; DEFAULT-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i32, i32* [[TMP16]], i32 24
-; DEFAULT-NEXT:    [[TMP27:%.*]] = bitcast i32* [[TMP26]] to <8 x i32>*
-; DEFAULT-NEXT:    [[WIDE_LOAD7:%.*]] = load <8 x i32>, <8 x i32>* [[TMP27]], align 4
-; DEFAULT-NEXT:    [[TMP28:%.*]] = add nsw <8 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD]]
-; DEFAULT-NEXT:    [[TMP29:%.*]] = add nsw <8 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD1]]
-; DEFAULT-NEXT:    [[TMP30:%.*]] = add nsw <8 x i32> [[WIDE_LOAD6]], [[WIDE_LOAD2]]
-; DEFAULT-NEXT:    [[TMP31:%.*]] = add nsw <8 x i32> [[WIDE_LOAD7]], [[WIDE_LOAD3]]
-; DEFAULT-NEXT:    [[TMP32:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; DEFAULT-NEXT:    [[TMP33:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP1]]
-; DEFAULT-NEXT:    [[TMP34:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP2]]
-; DEFAULT-NEXT:    [[TMP35:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
-; DEFAULT-NEXT:    [[TMP36:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 0
-; DEFAULT-NEXT:    [[TMP37:%.*]] = bitcast i32* [[TMP36]] to <8 x i32>*
-; DEFAULT-NEXT:    store <8 x i32> [[TMP28]], <8 x i32>* [[TMP37]], align 4
-; DEFAULT-NEXT:    [[TMP38:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 8
-; DEFAULT-NEXT:    [[TMP39:%.*]] = bitcast i32* [[TMP38]] to <8 x i32>*
-; DEFAULT-NEXT:    store <8 x i32> [[TMP29]], <8 x i32>* [[TMP39]], align 4
-; DEFAULT-NEXT:    [[TMP40:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 16
-; DEFAULT-NEXT:    [[TMP41:%.*]] = bitcast i32* [[TMP40]] to <8 x i32>*
-; DEFAULT-NEXT:    store <8 x i32> [[TMP30]], <8 x i32>* [[TMP41]], align 4
-; DEFAULT-NEXT:    [[TMP42:%.*]] = getelementptr inbounds i32, i32* [[TMP32]], i32 24
-; DEFAULT-NEXT:    [[TMP43:%.*]] = bitcast i32* [[TMP42]] to <8 x i32>*
-; DEFAULT-NEXT:    store <8 x i32> [[TMP31]], <8 x i32>* [[TMP43]], align 4
-; DEFAULT-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 32
-; DEFAULT-NEXT:    [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 416
-; DEFAULT-NEXT:    br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
-; DEFAULT:       middle.block:
-; DEFAULT-NEXT:    [[CMP_N:%.*]] = icmp eq i64 430, 416
-; DEFAULT-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; DEFAULT:       scalar.ph:
-; DEFAULT-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 416, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
-; DEFAULT:       for.cond.cleanup:
-; DEFAULT-NEXT:    ret void
-; DEFAULT:       for.body:
-; DEFAULT-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; DEFAULT-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    [[TMP45:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; DEFAULT-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    [[TMP46:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; DEFAULT-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP46]], [[TMP45]]
-; DEFAULT-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; DEFAULT-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; DEFAULT-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; DEFAULT-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; DEFAULT-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !5
-;
-; PREDFLAG-LABEL: @tail_folding_disabled(
-; PREDFLAG-NEXT:  entry:
-; PREDFLAG-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; PREDFLAG:       vector.ph:
-; PREDFLAG-NEXT:    br label [[VECTOR_BODY:%.*]]
-; PREDFLAG:       vector.body:
-; PREDFLAG-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; PREDFLAG-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
-; PREDFLAG-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
-; PREDFLAG-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
-; PREDFLAG-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; PREDFLAG-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; PREDFLAG-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
-; PREDFLAG-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
-; PREDFLAG-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef)
-; PREDFLAG-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
-; PREDFLAG-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
-; PREDFLAG-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
-; PREDFLAG-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef)
-; PREDFLAG-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
-; PREDFLAG-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; PREDFLAG-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
-; PREDFLAG-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
-; PREDFLAG-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP2]])
-; PREDFLAG-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
-; PREDFLAG-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
-; PREDFLAG-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
-; PREDFLAG:       middle.block:
-; PREDFLAG-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
-; PREDFLAG:       scalar.ph:
-; PREDFLAG-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 432, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; PREDFLAG-NEXT:    br label [[FOR_BODY:%.*]]
-; PREDFLAG:       for.cond.cleanup:
-; PREDFLAG-NEXT:    ret void
-; PREDFLAG:       for.body:
-; PREDFLAG-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
-; PREDFLAG-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
-; PREDFLAG-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; PREDFLAG-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
-; PREDFLAG-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
-; PREDFLAG-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
-; PREDFLAG-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
-; PREDFLAG-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
-; PREDFLAG-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; PREDFLAG-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
-; PREDFLAG-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !5
+; CHECK-LABEL: @tail_folding_disabled(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i64> undef, i64 [[INDEX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT]], <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <8 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
+; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ule <8 x i64> [[INDUCTION]], <i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429, i64 429>
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP4]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef)
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD1:%.*]] = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* [[TMP7]], i32 4, <8 x i1> [[TMP2]], <8 x i32> undef)
+; CHECK-NEXT:    [[TMP8:%.*]] = add nsw <8 x i32> [[WIDE_MASKED_LOAD1]], [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
+; CHECK-NEXT:    call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> [[TMP8]], <8 x i32>* [[TMP11]], i32 4, <8 x i1> [[TMP2]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 432
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 432, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 430
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop !5
 ;
 entry:
  br label %for.body
--- a/test/Transforms/LoopVectorize/optsize.ll
+++ b/test/Transforms/LoopVectorize/optsize.ll
@ -154,6 +154,73 @@ exit:
  ret i32 %for
 }

+; PR46228: Vectorize w/o versioning for unit stride under optsize and enabled
+; vectorization.
+
+; NOTE: Some assertions have been autogenerated by utils/update_test_checks.py
+define void @stride1(i16* noalias %B, i32 %BStride) optsize {
+; CHECK-LABEL: @stride1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i32> undef, i32 [[BSTRIDE:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i32> [[BROADCAST_SPLATINSERT]], <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE2:%.*]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 1>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE2]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = mul nsw <2 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ule <2 x i32> [[VEC_IND]], <i32 1024, i32 1024>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK:       pred.store.if:
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[TMP0]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i16, i16* [[B:%.*]], i32 [[TMP3]]
+; CHECK-NEXT:    store i16 42, i16* [[TMP4]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
+; CHECK:       pred.store.continue:
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
+; CHECK-NEXT:    br i1 [[TMP5]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.if1:
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i32> [[TMP0]], i32 1
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, i16* [[B]], i32 [[TMP6]]
+; CHECK-NEXT:    store i16 42, i16* [[TMP7]], align 4
+; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE2]]
+; CHECK:       pred.store.continue2:
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 2, i32 2>
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1026
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !19
+; CHECK:       middle.block:
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+; PGSO-LABEL: @stride1(
+; PGSO-NEXT:  entry:
+; PGSO-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
+;
+; NPGSO-LABEL: @stride1(
+; NPGSO-NEXT:  entry:
+; NPGSO-NEXT:    br i1 false, label %scalar.ph, label %vector.ph
+
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i32 [ %iv.next, %for.body ], [ 0, %entry ]
+  %mulB = mul nsw i32 %iv, %BStride
+  %gepOfB = getelementptr inbounds i16, i16* %B, i32 %mulB
+  store i16 42, i16* %gepOfB, align 4
+  %iv.next = add nuw nsw i32 %iv, 1
+  %exitcond = icmp eq i32 %iv.next, 1025
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !15
+
+for.end:
+  ret void
+}
+
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"ProfileSummary", !1}
 !1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
@ -170,3 +237,5 @@ exit:
 !12 = !{i32 999000, i64 100, i32 1}
 !13 = !{i32 999999, i64 1, i32 2}
 !14 = !{!"function_entry_count", i64 0}
+!15 = distinct !{!15, !16}
+!16 = !{!"llvm.loop.vectorize.enable", i1 true}
--- a/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
+++ b/test/Transforms/LoopVectorize/pr39417-optsize-scevchecks.ll
@ -26,13 +26,57 @@ bb68:
  ret void
 }

-; Check that the need for stride==1 check prevents vectorizing a loop under opt
-; for size.
-; CHECK-LABEL: @scev4stride1
-; CHECK-NOT: vector.scevcheck
-; CHECK-NOT: vector.body:
-; CHECK-LABEL: for.body:
+; Check that a loop under opt-for-size is vectorized, w/o checking for
+; stride==1.
+; NOTE: Some assertions have been autogenerated by utils/update_test_checks.py
 define void @scev4stride1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %k) #0 {
+; CHECK-LABEL: @scev4stride1(
+; CHECK-NEXT:  for.body.preheader:
+; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[K:%.*]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x i32> [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <4 x i32> [[TMP4]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP4]], i32 2
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP9]]
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <4 x i32> [[TMP4]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[B]], i32 [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = load i32, i32* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP14:%.*]] = load i32, i32* [[TMP8]], align 4
+; CHECK-NEXT:    [[TMP15:%.*]] = load i32, i32* [[TMP10]], align 4
+; CHECK-NEXT:    [[TMP16:%.*]] = load i32, i32* [[TMP12]], align 4
+; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <4 x i32> undef, i32 [[TMP13]], i32 0
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP14]], i32 1
+; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <4 x i32> [[TMP18]], i32 [[TMP15]], i32 2
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP16]], i32 3
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP0]]
+; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, i32* [[TMP21]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = bitcast i32* [[TMP22]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP20]], <4 x i32>* [[TMP23]], align 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT:    [[TMP24:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 1024, 1024
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK:       for.body:
+; CHECK:       for.end.loopexit:
+; CHECK-NEXT:    ret void
+;
 for.body.preheader:
  br label %for.body

--- a/test/Transforms/LoopVectorize/runtime-check.ll
+++ b/test/Transforms/LoopVectorize/runtime-check.ll
@ -162,9 +162,9 @@ loopexit:

 define dso_local void @forced_optsize(i64* noalias nocapture readonly %x_p, i64* noalias nocapture readonly %y_p, i64* noalias nocapture %z_p) minsize optsize {
 ;
-; FORCED_OPTSIZE: remark: <unknown>:0:0: Code-size may be reduced by not forcing vectorization, or by source-code modifications eliminating the need for runtime checks (e.g., adding 'restrict').
+; FORCED_OPTSIZE: remark: <unknown>:0:0: loop not vectorized: runtime pointer checks needed. Enable vectorization of this loop with '#pragma clang loop vectorize(enable)' when compiling with -Os/-Oz
 ; FORCED_OPTSIZE-LABEL: @forced_optsize(
-; FORCED_OPTSIZE:       vector.body:
+; FORCED_OPTSIZE-NOT:       vector.body:
 ;
 entry:
  br label %for.body