[LV] Preserve inbounds on created GEPs

Summary: This is a fix for PR23997. The loop vectorizer is not preserving the inbounds property of GEPs that it creates. This is inhibiting some optimizations. This patch preserves the inbounds property in the case where a load/store is being fed by an inbounds GEP. Reviewers: mkuper, javed.absar, hsaito Reviewed By: hsaito Subscribers: dcaballe, hsaito, llvm-commits Differential Revision: https://reviews.llvm.org/D46191 llvm-svn: 331269
2024-11-23 11:13:28 +01:00 · 2018-05-01 15:35:08 +00:00 · 2018-05-01 15:35:08 +00:00 · ccaecad889
commit ccaecad889
parent 58e484f1a9
12 changed files with 382 additions and 254 deletions
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@ -2254,6 +2254,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
  if (Group->isReverse())
    Index += (VF - 1) * Group->getFactor();

+  bool InBounds = false;
+  if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+    InBounds = gep->isInBounds();
+
  for (unsigned Part = 0; Part < UF; Part++) {
    Value *NewPtr = getOrCreateScalarValue(Ptr, {Part, 0});

@ -2269,6 +2273,8 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
    //       A[i+2] = c;     // Member of index 2 (Current instruction)
    // Current pointer is pointed to A[i+2], adjust it to A[i].
    NewPtr = Builder.CreateGEP(NewPtr, Builder.getInt32(-Index));
+    if (InBounds)
+      cast<GetElementPtrInst>(NewPtr)->setIsInBounds(true);

    // Cast to the vector pointer type.
    NewPtrs.push_back(Builder.CreateBitCast(NewPtr, PtrTy));
@ -2405,17 +2411,30 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
  if (isMaskRequired)
    Mask = *BlockInMask;

+  bool InBounds = false;
+  if (auto *gep = dyn_cast<GetElementPtrInst>(
+          getLoadStorePointerOperand(Instr)->stripPointerCasts()))
+    InBounds = gep->isInBounds();
+
  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
    // Calculate the pointer for the specific unroll-part.
-    Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
+    GetElementPtrInst *PartPtr = nullptr;

    if (Reverse) {
      // If the address is consecutive but reversed, then the
      // wide store needs to start at the last vector element.
-      PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
-      PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)));
+      PartPtr->setIsInBounds(InBounds);
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)));
+      PartPtr->setIsInBounds(InBounds);
      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
        Mask[Part] = reverseVector(Mask[Part]);
+    } else {
+      PartPtr = cast<GetElementPtrInst>(
+          Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)));
+      PartPtr->setIsInBounds(InBounds);
    }

    return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
--- a/test/Transforms/LoopVectorize/AArch64/pr36032.ll
+++ b/test/Transforms/LoopVectorize/AArch64/pr36032.ll
@ -78,11 +78,11 @@ define void @_Z1dv() local_unnamed_addr #0 {
 ; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[CONV]], [[TMP19]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = zext i32 [[TMP20]] to i64
 ; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds [6 x i8], [6 x i8]* @c, i64 0, i64 [[TMP21]]
-; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr i8, i8* [[TMP22]], i32 0
+; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i8, i8* [[TMP22]], i32 0
 ; CHECK-NEXT:    [[TMP24:%.*]] = bitcast i8* [[TMP23]] to <4 x i8>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP24]], align 1, !alias.scope !0
 ; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i8, i8* [[CALL]], i64 [[TMP17]]
-; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr i8, i8* [[TMP25]], i32 0
+; CHECK-NEXT:    [[TMP26:%.*]] = getelementptr inbounds i8, i8* [[TMP25]], i32 0
 ; CHECK-NEXT:    [[TMP27:%.*]] = bitcast i8* [[TMP26]] to <4 x i8>*
 ; CHECK-NEXT:    store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP27]], align 1, !alias.scope !3, !noalias !0
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
--- a/test/Transforms/LoopVectorize/ARM/sphinx.ll
+++ b/test/Transforms/LoopVectorize/ARM/sphinx.ll
@ -49,18 +49,18 @@ define i32 @test(float* nocapture readonly %x) {
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1>
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr float, float* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds float, float* [[TMP2]], i32 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast float* [[TMP3]] to <2 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, float* [[T4]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr float, float* [[TMP5]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds float, float* [[TMP5]], i32 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = bitcast float* [[TMP6]] to <2 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD1:%.*]] = load <2 x float>, <2 x float>* [[TMP7]], align 4
 ; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = fpext <2 x float> [[TMP8]] to <2 x double>
 ; CHECK-NEXT:    [[TMP10:%.*]] = fmul fast <2 x double> [[TMP9]], [[TMP9]]
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[T6]], i32 [[TMP1]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr float, float* [[TMP11]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* [[TMP11]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <2 x float>*
 ; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <2 x float>, <2 x float>* [[TMP13]], align 4
 ; CHECK-NEXT:    [[TMP14:%.*]] = fpext <2 x float> [[WIDE_LOAD2]] to <2 x double>
--- a/test/Transforms/LoopVectorize/X86/masked_load_store.ll
+++ b/test/Transforms/LoopVectorize/X86/masked_load_store.ll
--- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@ -2076,12 +2076,12 @@ define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; O1VEC2-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; O1VEC2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; O1VEC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; O1VEC2-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0
+; O1VEC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
 ; O1VEC2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
 ; O1VEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
 ; O1VEC2-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
 ; O1VEC2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; O1VEC2-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0
+; O1VEC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
 ; O1VEC2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
 ; O1VEC2-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4
 ; O1VEC2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
@ -2121,12 +2121,12 @@ define i32 @nopragma(i32* noalias nocapture %a, i32* noalias nocapture readonly
 ; OzVEC2-NEXT:    [[INDUCTION:%.*]] = add <4 x i64> [[BROADCAST_SPLAT]], <i64 0, i64 1, i64 2, i64 3>
 ; OzVEC2-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
 ; OzVEC2-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP0]]
-; OzVEC2-NEXT:    [[TMP2:%.*]] = getelementptr i32, i32* [[TMP1]], i32 0
+; OzVEC2-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[TMP1]], i32 0
 ; OzVEC2-NEXT:    [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
 ; OzVEC2-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP3]], align 4
 ; OzVEC2-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT2]]
 ; OzVEC2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP0]]
-; OzVEC2-NEXT:    [[TMP6:%.*]] = getelementptr i32, i32* [[TMP5]], i32 0
+; OzVEC2-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i32 0
 ; OzVEC2-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
 ; OzVEC2-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* [[TMP7]], align 4
 ; OzVEC2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
--- a/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
+++ b/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
@ -49,7 +49,7 @@ for.end:
 ; CHECK:       %offset.idx = sub i64 %n, %index
 ; CHECK-NOT:   getelementptr
 ; CHECK:       %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 -3
-; CHECK:       getelementptr i32, i32* %[[G0]], i64 %offset.idx
+; CHECK:       getelementptr inbounds i32, i32* %[[G0]], i64 %offset.idx
 ; CHECK-NOT:   getelementptr
 ; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
 ;
@ -162,7 +162,7 @@ for.end:
 ; INTER:       %offset.idx = sub i64 %n, %index
 ; INTER-NOT:   getelementptr
 ; INTER:       %[[G0:.+]] = getelementptr inbounds %pair, %pair* %p, i64 %offset.idx, i32 0
-; INTER:       getelementptr i32, i32* %[[G0]], i64 -6
+; INTER:       getelementptr inbounds i32, i32* %[[G0]], i64 -6
 ; INTER-NOT:   getelementptr
 ; INTER:       br i1 {{.*}}, label %middle.block, label %vector.body
 ;
--- a/test/Transforms/LoopVectorize/float-induction.ll
+++ b/test/Transforms/LoopVectorize/float-induction.ll
@ -54,7 +54,7 @@
 ; VEC4_INTERL2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
 ; VEC4_INTERL2-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
 ; VEC4_INTERL2-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP10]], align 4
-; VEC4_INTERL2-NEXT:    [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i64 4
+; VEC4_INTERL2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* [[TMP9]], i64 4
 ; VEC4_INTERL2-NEXT:    [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>*
 ; VEC4_INTERL2-NEXT:    store <4 x float> [[STEP_ADD]], <4 x float>* [[TMP12]], align 4
 ; VEC4_INTERL2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
--- a/test/Transforms/LoopVectorize/induction-step.ll
+++ b/test/Transforms/LoopVectorize/induction-step.ll
@ -30,7 +30,7 @@
 ; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
 ; CHECK:         [[TMP8:%.*]] = add i64 %index, 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP8]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[TMP9]], i32 0
 ; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
 ; CHECK-NEXT:    store <8 x i32> %vec.ind, <8 x i32>* [[TMP11]], align 4
 ; CHECK:         %index.next = add i64 %index, 8
@ -101,7 +101,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
 ; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
 ; CHECK-NEXT:    store <8 x i32> %vec.ind, <8 x i32>* [[TMP9]], align 4
 ; CHECK:         %index.next = add i64 %index, 8
@ -174,7 +174,7 @@ for.end6:                                         ; preds = %for.end6.loopexit,
 ; CHECK:         [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
 ; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
 ; CHECK-NEXT:    store <8 x i32> [[VEC_IND10]], <8 x i32>* [[TMP9]], align 4
 ; CHECK-NEXT:    %index.next = add i64 %index, 8
--- a/test/Transforms/LoopVectorize/induction.ll
+++ b/test/Transforms/LoopVectorize/induction.ll
@ -14,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ; CHECK-NEXT:    %vec.ind = phi <2 x i32> [ <i32 190, i32 191>, %vector.ph ], [ %vec.ind.next, %vector.body ]
 ; CHECK:         [[TMP3:%.*]] = add i64 %index, 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* %A, i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
 ; CHECK-NEXT:    store <2 x i32> %vec.ind, <2 x i32>* [[TMP6]], align 4
 ; CHECK:         %index.next = add i64 %index, 2
@ -107,7 +107,7 @@ loopexit:
 ; UNROLL:       %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; UNROLL-NOT:   add i64 {{.*}}, 4
 ; UNROLL:       %[[g1:.+]] = getelementptr inbounds i64, i64* %a, i64 %index
-; UNROLL:       getelementptr i64, i64* %[[g1]], i64 2
+; UNROLL:       getelementptr inbounds i64, i64* %[[g1]], i64 2

 define i64 @scalarize_induction_variable_01(i64 *%a, i64 %n) {
 entry:
@ -353,7 +353,7 @@ for.end:
 ; UNROLL:   %[[I2:.+]] = or i32 %index, 2
 ; UNROLL:   %[[E0:.+]] = sext i32 %index to i64
 ; UNROLL:   %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[E0]]
-; UNROLL:   getelementptr i32, i32* %[[G0]], i64 2
+; UNROLL:   getelementptr inbounds i32, i32* %[[G0]], i64 2
 ; UNROLL: pred.udiv.if:
 ; UNROLL:   udiv i32 {{.*}}, %index
 ; UNROLL: pred.udiv.if{{[0-9]+}}:
@ -711,7 +711,7 @@ exit:
 ; CHECK:   %offset.idx = add i32 %i, %index
 ; CHECK:   %[[A1:.*]] = add i32 %offset.idx, 0
 ; CHECK:   %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i32 %[[A1]]
-; CHECK:   %[[G3:.*]] = getelementptr i32, i32* %[[G1]], i32 0
+; CHECK:   %[[G3:.*]] = getelementptr inbounds i32, i32* %[[G1]], i32 0
 ; CHECK:   %[[B1:.*]] = bitcast i32* %[[G3]] to <2 x i32>*
 ; CHECK:   store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
 ; CHECK:   %index.next = add i32 %index, 2
@ -751,7 +751,7 @@ exit:
 ; UNROLL:   %[[G1:.*]] = getelementptr inbounds i32, i32* %a, i64 %[[S1]]
 ; UNROLL:   %[[B1:.*]] = bitcast i32* %[[G1]] to <2 x i32>*
 ; UNROLL:   store <2 x i32> %vec.ind, <2 x i32>* %[[B1]]
-; UNROLL:   %[[G2:.*]] = getelementptr i32, i32* %[[G1]], i64 2
+; UNROLL:   %[[G2:.*]] = getelementptr inbounds i32, i32* %[[G1]], i64 2
 ; UNROLL:   %[[B2:.*]] = bitcast i32* %[[G2]] to <2 x i32>*
 ; UNROLL:   store <2 x i32> %step.add, <2 x i32>* %[[B2]]
 ; UNROLL:   %index.next = add i32 %index, 4
@ -780,7 +780,7 @@ exit:
 ; CHECK:         [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
 ; CHECK:         [[TMP3:%.*]] = add i64 %index, 0
 ; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* %a, i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
 ; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
 ; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4
 ; CHECK-NEXT:    %index.next = add i64 %index, 2
--- a/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/test/Transforms/LoopVectorize/interleaved-accesses.ll
@ -245,7 +245,7 @@ for.body:                                         ; preds = %for.body, %entry

 ; CHECK-LABEL: @test_reversed_load2_store2(
 ; CHECK: %[[G0:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %A, i64 %offset.idx, i32 0
-; CHECK: %[[G1:.+]] = getelementptr i32, i32* %[[G0]], i64 -6
+; CHECK: %[[G1:.+]] = getelementptr inbounds i32, i32* %[[G0]], i64 -6
 ; CHECK: %[[B0:.+]] = bitcast i32* %[[G1]] to <8 x i32>*
 ; CHECK: %wide.vec = load <8 x i32>, <8 x i32>* %[[B0]], align 4
 ; CHECK: shufflevector <8 x i32> %wide.vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@ -255,7 +255,7 @@ for.body:                                         ; preds = %for.body, %entry
 ; CHECK: add nsw <4 x i32>
 ; CHECK: sub nsw <4 x i32>
 ; CHECK: %[[G2:.+]] = getelementptr inbounds %struct.ST2, %struct.ST2* %B, i64 %offset.idx, i32 1
-; CHECK: %[[G3:.+]] = getelementptr i32, i32* %[[G2]], i64 -7
+; CHECK: %[[G3:.+]] = getelementptr inbounds i32, i32* %[[G2]], i64 -7
 ; CHECK: %[[B1:.+]] = bitcast i32* %[[G3]] to <8 x i32>*
 ; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK: shufflevector <4 x i32> {{.*}}, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
--- a/test/Transforms/LoopVectorize/pr23997.ll
+++ b/test/Transforms/LoopVectorize/pr23997.ll
@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:1"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Ensure that the 'inbounds' is preserved on the GEPs that feed the load and store in the loop.
+define void @foo(i8 addrspace(1)* align 8 dereferenceable_or_null(16), i8 addrspace(1)* align 8 dereferenceable_or_null(8), i64) #0 {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[PREHEADER:%.*]]
+; CHECK:       preheader:
+; CHECK-NEXT:    [[DOT10:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP0:%.*]], i64 16
+; CHECK-NEXT:    [[DOT11:%.*]] = bitcast i8 addrspace(1)* [[DOT10]] to i8 addrspace(1)* addrspace(1)*
+; CHECK-NEXT:    [[DOT12:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[TMP1:%.*]], i64 16
+; CHECK-NEXT:    [[DOT13:%.*]] = bitcast i8 addrspace(1)* [[DOT12]] to i8 addrspace(1)* addrspace(1)*
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ugt i64 [[TMP2:%.*]], 1
+; CHECK-NEXT:    [[UMAX:%.*]] = select i1 [[TMP3]], i64 [[TMP2]], i64 1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp ugt i64 [[TMP2]], 1
+; CHECK-NEXT:    [[UMAX1:%.*]] = select i1 [[TMP4]], i64 [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[UMAX1]], 3
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP5]], 8
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP0]], i64 [[TMP6]]
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, i8 addrspace(1)* [[TMP1]], i64 [[TMP6]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult i8 addrspace(1)* [[DOT10]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult i8 addrspace(1)* [[DOT12]], [[SCEVGEP]]
+; CHECK-NEXT:    [[MEMCHECK_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
+; CHECK-NEXT:    br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_VEC:%.*]] = and i64 [[UMAX]], -16
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT13]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP7]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP8]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 4
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP9]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP10]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 8
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP11]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    [[WIDE_LOAD7:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP12]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP7]], i64 12
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP13]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    [[WIDE_LOAD8:%.*]] = load <4 x i8 addrspace(1)*>, <4 x i8 addrspace(1)*> addrspace(1)* [[TMP14]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT11]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP15]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    store <4 x i8 addrspace(1)*> [[WIDE_LOAD]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP16]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 4
+; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP17]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    store <4 x i8 addrspace(1)*> [[WIDE_LOAD6]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP18]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 8
+; CHECK-NEXT:    [[TMP20:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP19]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    store <4 x i8 addrspace(1)*> [[WIDE_LOAD7]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP20]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[TMP15]], i64 12
+; CHECK-NEXT:    [[TMP22:%.*]] = bitcast i8 addrspace(1)* addrspace(1)* [[TMP21]] to <4 x i8 addrspace(1)*> addrspace(1)*
+; CHECK-NEXT:    store <4 x i8 addrspace(1)*> [[WIDE_LOAD8]], <4 x i8 addrspace(1)*> addrspace(1)* [[TMP22]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[PREHEADER]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    [[INDVARS_IV3:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT4:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[DOT18:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT13]], i64 [[INDVARS_IV3]]
+; CHECK-NEXT:    [[V:%.*]] = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT18]], align 8
+; CHECK-NEXT:    [[DOT20:%.*]] = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* [[DOT11]], i64 [[INDVARS_IV3]]
+; CHECK-NEXT:    store i8 addrspace(1)* [[V]], i8 addrspace(1)* addrspace(1)* [[DOT20]], align 8
+; CHECK-NEXT:    [[INDVARS_IV_NEXT4]] = add nuw nsw i64 [[INDVARS_IV3]], 1
+; CHECK-NEXT:    [[DOT21:%.*]] = icmp ult i64 [[INDVARS_IV_NEXT4]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[DOT21]], label [[LOOP]], label [[LOOPEXIT]], !llvm.loop !7
+; CHECK:       loopexit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %preheader
+
+preheader:                                       ; preds = %4
+  %.10 = getelementptr inbounds i8, i8 addrspace(1)* %0, i64 16
+  %.11 = bitcast i8 addrspace(1)* %.10 to i8 addrspace(1)* addrspace(1)*
+  %.12 = getelementptr inbounds i8, i8 addrspace(1)* %1, i64 16
+  %.13 = bitcast i8 addrspace(1)* %.12 to i8 addrspace(1)* addrspace(1)*
+  br label %loop
+
+loop:
+  %indvars.iv3 = phi i64 [ 0, %preheader ], [ %indvars.iv.next4, %loop ]
+  %.18 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.13, i64 %indvars.iv3
+  %v = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.18, align 8
+  %.20 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %.11, i64 %indvars.iv3
+  store i8 addrspace(1)* %v, i8 addrspace(1)* addrspace(1)* %.20, align 8
+  %indvars.iv.next4 = add nuw nsw i64 %indvars.iv3, 1
+  %.21 = icmp ult i64 %indvars.iv.next4, %2
+  br i1 %.21, label %loop, label %loopexit
+
+loopexit:
+  ret void
+}
+
+attributes #0 = { uwtable "target-cpu"="skylake" "target-features"="+sse2,+cx16,+sahf,-tbm,-avx512ifma,-sha,-gfni,-fma4,-vpclmulqdq,+prfchw,+bmi2,-cldemote,+fsgsbase,+xsavec,+popcnt,+aes,-avx512bitalg,+xsaves,-avx512er,-avx512vnni,-avx512vpopcntdq,-clwb,-avx512f,-clzero,-pku,+mmx,-lwp,-rdpid,-xop,+rdseed,-waitpkg,-ibt,-sse4a,-avx512bw,+clflushopt,+xsave,-avx512vbmi2,-avx512vl,-avx512cd,+avx,-vaes,+rtm,+fma,+bmi,+rdrnd,-mwaitx,+sse4.1,+sse4.2,+avx2,-wbnoinvd,+sse,+lzcnt,+pclmul,-prefetchwt1,+f16c,+ssse3,+sgx,-shstk,+cmov,-avx512vbmi,+movbe,+xsaveopt,-avx512dq,+adx,-avx512pf,+sse3" }
+
+!0 = !{i32 0, i32 2147483646}
+!1 = !{}
--- a/test/Transforms/LoopVectorize/scalar_after_vectorization.ll
+++ b/test/Transforms/LoopVectorize/scalar_after_vectorization.ll
@ -13,7 +13,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 ; CHECK:   %[[T4:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[T3]]
 ; CHECK:   %[[T5:.+]] = bitcast i32* %[[T4]] to <4 x i32>*
 ; CHECK:   load <4 x i32>, <4 x i32>* %[[T5]], align 4
-; CHECK:   %[[T6:.+]] = getelementptr i32, i32* %[[T4]], i64 4
+; CHECK:   %[[T6:.+]] = getelementptr inbounds i32, i32* %[[T4]], i64 4
 ; CHECK:   %[[T7:.+]] = bitcast i32* %[[T6]] to <4 x i32>*
 ; CHECK:   load <4 x i32>, <4 x i32>* %[[T7]], align 4
 ; CHECK:   br {{.*}}, label %middle.block, label %vector.body
@ -31,10 +31,10 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 ; NO-IC:   %[[T7:.+]] = sub nsw i64 %[[T5]], %x
 ; NO-IC:   %[[T8:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[T6]]
 ; NO-IC:   %[[T9:.+]] = getelementptr inbounds i32, i32* %a, i64 %[[T7]]
-; NO-IC:   %[[T10:.+]] = getelementptr i32, i32* %[[T8]], i32 0
+; NO-IC:   %[[T10:.+]] = getelementptr inbounds i32, i32* %[[T8]], i32 0
 ; NO-IC:   %[[T11:.+]] = bitcast i32* %[[T10]] to <4 x i32>*
 ; NO-IC:   load <4 x i32>, <4 x i32>* %[[T11]], align 4
-; NO-IC:   %[[T12:.+]] = getelementptr i32, i32* %[[T8]], i32 4
+; NO-IC:   %[[T12:.+]] = getelementptr inbounds i32, i32* %[[T8]], i32 4
 ; NO-IC:   %[[T13:.+]] = bitcast i32* %[[T12]] to <4 x i32>*
 ; NO-IC:   load <4 x i32>, <4 x i32>* %[[T13]], align 4
 ; NO-IC:   br {{.*}}, label %middle.block, label %vector.body