[LV] Don't assume isScalarAfterVectorization if one of the uses needs widening.
This fixes an issue that was found in D105199, where a GEP instruction is used both as the address of a store and as the value being stored. For the former use the value is scalar after vectorization, but the latter (the stored value) requires widening. Other code in that function seems to prevent similar cases from happening, but this case was missed.

Reviewed By: david-arm

Differential Revision: https://reviews.llvm.org/D106164
parent 5e51e7ed64
commit 6651df41b7
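To illustrate the pattern described above, here is a minimal, hypothetical LLVM IR loop (the function and value names are invented for this sketch and do not come from the patch) in which the same GEP feeds a store twice: once as the address, which is a scalar use after vectorization, and once as the stored value, which must be widened. The regression test added below (sve-widen-gep.ll) exercises the same situation with a pointer PHI whose GEP increment is both stored as a value and used as a memory address.

; Minimal sketch (hypothetical names): %gep is the address of one store and
; the value stored by another store in the same loop.
define void @sketch(i8** noalias %dst, i8* noalias %base, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %gep = getelementptr inbounds i8, i8* %base, i64 %iv
  %slot = getelementptr inbounds i8*, i8** %dst, i64 %iv
  store i8* %gep, i8** %slot, align 8   ; %gep used as the stored value -> needs widening
  store i8 0, i8* %gep, align 1         ; %gep used as a store address -> scalar use
  %iv.next = add nuw nsw i64 %iv, 1
  %cond = icmp ne i64 %iv.next, %n
  br i1 %cond, label %loop, label %exit

exit:
  ret void
}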
@@ -5123,13 +5123,12 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
   auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
     if (isScalarPtrInduction(MemAccess, Ptr)) {
       Worklist.insert(cast<Instruction>(Ptr));
-      Instruction *Update = cast<Instruction>(
-          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
-      Worklist.insert(Update);
       LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
                         << "\n");
-      LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Update
-                        << "\n");
+
+      Instruction *Update = cast<Instruction>(
+          cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));
+      ScalarPtrs.insert(Update);
       return;
     }
     // We only care about bitcast and getelementptr instructions contained in
@@ -5143,11 +5142,12 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
     if (Worklist.count(I))
       return;
 
-    // If the use of the pointer will be a scalar use, and all users of the
-    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
-    // place the pointer in PossibleNonScalarPtrs.
-    if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
-          return isa<LoadInst>(U) || isa<StoreInst>(U);
+    // If all users of the pointer will be memory accesses and scalar, place the
+    // pointer in ScalarPtrs. Otherwise, place the pointer in
+    // PossibleNonScalarPtrs.
+    if (llvm::all_of(I->users(), [&](User *U) {
+          return (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
+                 isScalarUse(cast<Instruction>(U), Ptr);
         }))
       ScalarPtrs.insert(I);
     else
@@ -91,7 +91,7 @@ for.end:
 ; Same as predicate_store except we use a pointer PHI to maintain the address
 ;
 ; CHECK: Found new scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
-; CHECK: Found new scalar instruction: %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
+; CHECK: Found scalar instruction: %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
 ; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %addr, align 4
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
 ; CHECK: Found an estimated cost of 3 for VF 2 For instruction: store i32 %tmp2, i32* %addr, align 4
test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll (new file, 81 lines)
@@ -0,0 +1,81 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -scalable-vectorization=on -S -mtriple=aarch64 -mattr=+sve -debug-only=loop-vectorize < %s 2>&1 | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; In the test below the PHI instruction:
+;    %0 = phi i8* [ %incdec.ptr190, %loop.body ], [ %src, %entry ]
+; has multiple uses, i.e.
+; 1. As a uniform address for the load, and
+; 2. Non-uniform use by the getelementptr + store, which leads to replication.
+
+; CHECK-LABEL: LV: Checking a loop in "phi_multiple_use"
+; CHECK-NOT: LV: Found new scalar instruction: %incdec.ptr190 = getelementptr inbounds i8, i8* %0, i64 1
+;
+; CHECK: VPlan 'Initial VPlan for VF={vscale x 2},UF>=1' {
+; CHECK-NEXT: loop.body:
+; CHECK-NEXT:   WIDEN-INDUCTION %index = phi 0, %index.next
+; CHECK-NEXT:   WIDEN-PHI %curchar = phi %curchar.next, %curptr
+; CHECK-NEXT:   WIDEN-PHI %0 = phi %incdec.ptr190, %src
+; CHECK-NEXT:   WIDEN-GEP Var[Inv] ir<%incdec.ptr190> = getelementptr ir<%0>, ir<1>
+; CHECK-NEXT:   WIDEN store ir<%curchar>, ir<%incdec.ptr190>
+; CHECK-NEXT:   WIDEN ir<%1> = load ir<%0>
+; CHECK-NEXT:   WIDEN ir<%2> = add ir<%1>, ir<1>
+; CHECK-NEXT:   WIDEN store ir<%0>, ir<%2>
+; CHECK-NEXT: No successors
+; CHECK-NEXT: }
+
+define void @phi_multiple_use(i8** noalias %curptr, i8* noalias %src, i64 %N) #0 {
+; CHECK-LABEL: @phi_multiple_use(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, %vector.ph ], [ {{.*}}, %vector.body ]
+; CHECK-NEXT:    {{.*}} = add i64 [[INDEX1]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX1]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8*, i8** %curptr, i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX1]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <vscale x 2 x i64> shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 0, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer), [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP3]]
+; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, i8* %src, <vscale x 2 x i64> [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP6]], i64 1
+; CHECK:         store <vscale x 2 x i8*> [[TMP5]], <vscale x 2 x i8*>*
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <vscale x 2 x i8*> [[NEXT_GEP6]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, i8* [[TMP6]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <vscale x 2 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]],
+; CHECK:         store <vscale x 2 x i8> [[TMP9]], <vscale x 2 x i8>*
+
+entry:
+  br label %loop.body
+
+loop.body:                                        ; preds = %loop.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %loop.body ]
+  %curchar = phi i8** [ %curchar.next, %loop.body ], [ %curptr, %entry ]
+  %0 = phi i8* [ %incdec.ptr190, %loop.body ], [ %src, %entry ]
+  %incdec.ptr190 = getelementptr inbounds i8, i8* %0, i64 1
+  %curchar.next = getelementptr inbounds i8*, i8** %curchar, i64 1
+  store i8* %incdec.ptr190, i8** %curchar, align 8
+  %1 = load i8, i8* %0, align 1
+  %2 = add i8 %1, 1
+  store i8 %2, i8* %0, align 1
+  %index.next = add nuw i64 %index, 1
+  %3 = icmp ne i64 %index.next, %N
+  br i1 %3, label %loop.body, label %exit, !llvm.loop !0
+
+exit:                                             ; preds = %loop.body
+  ret void
+}
+
+
+attributes #0 = {"target-features"="+sve"}
+
+!0 = distinct !{!0, !1, !2, !3}
+!1 = !{!"llvm.loop.interleave.count", i32 1}
+!2 = !{!"llvm.loop.vectorize.width", i32 2}
+!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
+!4 = !{ !5 }
+!5 = distinct !{ !5, !6 }
+!6 = distinct !{ !7 }
+!7 = distinct !{ !7, !6 }
@@ -146,26 +146,23 @@ define void @pointer_induction_used_as_vector(i8** noalias %start.1, i8* noalias
 ; CHECK-NEXT:    [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP4]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 3
 ; CHECK-NEXT:    [[NEXT_GEP7:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP4]], i64 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP5]], i64 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP6]], i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP7]], i64 1
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x i8*> poison, i8* [[TMP6]], i32 0
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x i8*> [[TMP10]], i8* [[TMP7]], i32 1
-; CHECK-NEXT:    [[TMP12:%.*]] = insertelement <4 x i8*> [[TMP11]], i8* [[TMP8]], i32 2
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <4 x i8*> [[TMP12]], i8* [[TMP9]], i32 3
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i8** [[TMP14]] to <4 x i8*>*
-; CHECK-NEXT:    store <4 x i8*> [[TMP13]], <4 x i8*>* [[TMP15]], align 8
-; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, i8* [[NEXT_GEP4]], i32 0
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP17]], align 1
-; CHECK-NEXT:    [[TMP18:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast i8* [[TMP16]] to <4 x i8>*
-; CHECK-NEXT:    store <4 x i8> [[TMP18]], <4 x i8>* [[TMP19]], align 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i8*> poison, i8* [[NEXT_GEP4]], i32 0
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i8*> [[TMP6]], i8* [[NEXT_GEP5]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x i8*> [[TMP7]], i8* [[NEXT_GEP6]], i32 2
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x i8*> [[TMP8]], i8* [[NEXT_GEP7]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, <4 x i8*> [[TMP9]], i64 1
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8** [[TMP11]] to <4 x i8*>*
+; CHECK-NEXT:    store <4 x i8*> [[TMP10]], <4 x i8*>* [[TMP12]], align 8
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i8, i8* [[NEXT_GEP4]], i32 0
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8* [[TMP13]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP14]], align 1
+; CHECK-NEXT:    [[TMP15:%.*]] = add <4 x i8> [[WIDE_LOAD]], <i8 1, i8 1, i8 1, i8 1>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8* [[TMP13]] to <4 x i8>*
+; CHECK-NEXT:    store <4 x i8> [[TMP15]], <4 x i8>* [[TMP16]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]