From 8531d202cf4e642d0e66b620ced23c822dc48d64 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Thu, 22 Jul 2021 10:58:31 +0100
Subject: [PATCH] [InstCombine] Fold (gep (oneuse(gep Ptr, Idx0)), Idx1) ->
 (gep Ptr, (add Idx0, Idx1)) (PR51069)

As noticed on D106352, after we've folded "(select C, (gep Ptr, Idx), Ptr)
-> (gep Ptr, (select C, Idx, 0))", if the inner Ptr was itself a (now
one-use) gep we can merge the two geps, using the sum of the indices
instead.

I've limited this to basic 2-op geps: a more general case further down
InstCombinerImpl::visitGetElementPtrInst doesn't have the one-use
limitation, but it only creates the add if it can be simplified via
SimplifyAddInst.

https://alive2.llvm.org/ce/z/f8pLfD (Thanks Roman!)

Differential Revision: https://reviews.llvm.org/D106450
---
 .../InstCombine/InstructionCombining.cpp      | 104 ++++++++--------
 .../InstCombine/gep-combine-loop-invariant.ll |  12 +-
 test/Transforms/InstCombine/gep-custom-dl.ll  |   4 +-
 test/Transforms/InstCombine/getelementptr.ll  |   4 +-
 test/Transforms/InstCombine/select-gep.ll     |  12 +-
 test/Transforms/InstCombine/shift.ll          |   4 +-
 .../AArch64/sve-vector-reverse.ll             | 100 ++++++++---------
 .../AArch64/vector-reverse-mask4.ll           |  54 +++++----
 .../ARM/mve-reduction-predselect.ll           |  40 +++----
 .../LoopVectorize/ARM/mve-reductions.ll       |  26 ++---
 .../x86-interleaved-accesses-masked-group.ll  |  60 +++++-----
 .../LoopVectorize/consecutive-ptr-uniforms.ll |   4 +-
 .../LoopVectorize/interleaved-accesses.ll     |  62 +++++------
 13 files changed, 243 insertions(+), 243 deletions(-)

diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 07b354f596a..8e8d8a75f79 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2076,54 +2076,70 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
   if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
     return nullptr;
 
-  // Try to reassociate loop invariant GEP chains to enable LICM.
-  if (LI && Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
+  if (Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
       Src->hasOneUse()) {
-    if (Loop *L = LI->getLoopFor(GEP.getParent())) {
-      Value *GO1 = GEP.getOperand(1);
-      Value *SO1 = Src->getOperand(1);
-      // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
-      // invariant: this breaks the dependence between GEPs and allows LICM
-      // to hoist the invariant part out of the loop.
-      if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
-        // We have to be careful here.
-        // We have something like:
-        //  %src = getelementptr , * %base, %idx
-        //  %gep = getelementptr , * %src, %idx2
-        // If we just swap idx & idx2 then we could inadvertantly
-        // change %src from a vector to a scalar, or vice versa.
-        // Cases:
-        //  1) %base a scalar & idx a scalar & idx2 a vector
-        //     => Swapping idx & idx2 turns %src into a vector type.
-        //  2) %base a scalar & idx a vector & idx2 a scalar
-        //     => Swapping idx & idx2 turns %src in a scalar type
-        //  3) %base, %idx, and %idx2 are scalars
-        //     => %src & %gep are scalars
-        //     => swapping idx & idx2 is safe
-        //  4) %base a vector
-        //     => %src is a vector
-        //     => swapping idx & idx2 is safe.
-        auto *SO0 = Src->getOperand(0);
-        auto *SO0Ty = SO0->getType();
-        if (!isa<VectorType>(GEPType) || // case 3
-            isa<VectorType>(SO0Ty)) {    // case 4
-          Src->setOperand(1, GO1);
-          GEP.setOperand(1, SO1);
-          return &GEP;
-        } else {
-          // Case 1 or 2
-          // -- have to recreate %src & %gep
-          // put NewSrc at same location as %src
-          Builder.SetInsertPoint(cast<Instruction>(PtrOp));
-          auto *NewSrc = cast<GetElementPtrInst>(
-              Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName()));
-          NewSrc->setIsInBounds(Src->isInBounds());
-          auto *NewGEP = GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1});
-          NewGEP->setIsInBounds(GEP.isInBounds());
-          return NewGEP;
+    Value *GO1 = GEP.getOperand(1);
+    Value *SO1 = Src->getOperand(1);
+
+    if (LI) {
+      // Try to reassociate loop invariant GEP chains to enable LICM.
+      if (Loop *L = LI->getLoopFor(GEP.getParent())) {
+        // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
+        // invariant: this breaks the dependence between GEPs and allows LICM
+        // to hoist the invariant part out of the loop.
+        if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
+          // We have to be careful here.
+          // We have something like:
+          //  %src = getelementptr , * %base, %idx
+          //  %gep = getelementptr , * %src, %idx2
+          // If we just swap idx & idx2 then we could inadvertently
+          // change %src from a vector to a scalar, or vice versa.
+          // Cases:
+          //  1) %base a scalar & idx a scalar & idx2 a vector
+          //     => Swapping idx & idx2 turns %src into a vector type.
+          //  2) %base a scalar & idx a vector & idx2 a scalar
+          //     => Swapping idx & idx2 turns %src into a scalar type.
+          //  3) %base, %idx, and %idx2 are scalars
+          //     => %src & %gep are scalars
+          //     => swapping idx & idx2 is safe
+          //  4) %base a vector
+          //     => %src is a vector
+          //     => swapping idx & idx2 is safe.
+          auto *SO0 = Src->getOperand(0);
+          auto *SO0Ty = SO0->getType();
+          if (!isa<VectorType>(GEPType) || // case 3
+              isa<VectorType>(SO0Ty)) {    // case 4
+            Src->setOperand(1, GO1);
+            GEP.setOperand(1, SO1);
+            return &GEP;
+          } else {
+            // Case 1 or 2
+            // -- have to recreate %src & %gep
+            // put NewSrc at same location as %src
+            Builder.SetInsertPoint(cast<Instruction>(PtrOp));
+            auto *NewSrc = cast<GetElementPtrInst>(
+                Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName()));
+            NewSrc->setIsInBounds(Src->isInBounds());
+            auto *NewGEP =
+                GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1});
+            NewGEP->setIsInBounds(GEP.isInBounds());
+            return NewGEP;
+          }
         }
       }
     }
+
+    // Fold gep(gep(Ptr, Idx0), Idx1) -> gep(Ptr, add(Idx0, Idx1))
+    if (GO1->getType() == SO1->getType()) {
+      bool NewInBounds = GEP.isInBounds() && Src->isInBounds();
+      auto *NewIdx =
+          Builder.CreateAdd(GO1, SO1, GEP.getName() + ".idx",
+                            /*HasNUW*/ false, /*HasNSW*/ NewInBounds);
+      auto *NewGEP = GetElementPtrInst::Create(
+          GEPEltType, Src->getPointerOperand(), {NewIdx});
+      NewGEP->setIsInBounds(NewInBounds);
+      return NewGEP;
+    }
   }
 
   // Note that if our source is a gep chain itself then we wait for that
diff --git a/test/Transforms/InstCombine/gep-combine-loop-invariant.ll b/test/Transforms/InstCombine/gep-combine-loop-invariant.ll
index 82629200fa7..faaaff8fec9 100644
--- a/test/Transforms/InstCombine/gep-combine-loop-invariant.ll
+++ b/test/Transforms/InstCombine/gep-combine-loop-invariant.ll
@@ -8,10 +8,10 @@ define i32 @foo(i8* nocapture readnone %match, i32 %cur_match, i32 %best_len, i3
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[IDX_EXT2:%.*]] = zext i32 [[CUR_MATCH:%.*]] to i64
-; CHECK-NEXT:    [[ADD_PTR4:%.*]] = getelementptr inbounds i8, i8* [[WIN:%.*]], i64 [[IDX_EXT2]]
 ; CHECK-NEXT:    [[IDX_EXT1:%.*]] = zext i32 [[BEST_LEN:%.*]] to i64
-; CHECK-NEXT:    [[ADD_PTR25:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR4]], i64 [[IDX_EXT1]]
-; CHECK-NEXT:    [[ADD_PTR36:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR25]], i64 -1
+; CHECK-NEXT:    [[ADD_PTR25_IDX:%.*]] = add nuw nsw i64 [[IDX_EXT1]], [[IDX_EXT2]]
+; CHECK-NEXT:    [[ADD_PTR36_IDX:%.*]] = add nsw i64 [[ADD_PTR25_IDX]], -1
+; CHECK-NEXT:    [[ADD_PTR36:%.*]] = getelementptr inbounds i8, i8* [[WIN:%.*]], i64 [[ADD_PTR36_IDX]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[ADD_PTR36]] to i32*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[TMP0]], align 4
 ; CHECK-NEXT:    [[CMP7:%.*]] = icmp eq i32 [[TMP1]], [[SCAN_END:%.*]]
@@ -20,9 +20,9 @@ define i32 @foo(i8* nocapture readnone %match, i32 %cur_match, i32 %best_len, i3
 ; CHECK-NEXT:    br label [[IF_THEN:%.*]]
 ; CHECK:       do.body:
 ; CHECK-NEXT:    [[IDX_EXT:%.*]] = zext i32 [[TMP4:%.*]] to i64
-; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i8, i8* [[WIN]], i64 [[IDX_EXT1]]
-; CHECK-NEXT:    [[ADD_PTR2:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR]], i64 -1
-; CHECK-NEXT:    [[ADD_PTR3:%.*]] = getelementptr inbounds i8, i8* [[ADD_PTR2]], i64 [[IDX_EXT]]
+; CHECK-NEXT:    [[ADD_PTR2_IDX:%.*]] = add nuw nsw i64 [[IDX_EXT]], [[IDX_EXT1]]
+; CHECK-NEXT:    [[ADD_PTR3_IDX:%.*]] = add nsw i64 [[ADD_PTR2_IDX]], -1
+; CHECK-NEXT:    [[ADD_PTR3:%.*]] = getelementptr inbounds i8, i8* [[WIN]], i64 [[ADD_PTR3_IDX]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[ADD_PTR3]] to i32*
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[TMP3]], [[SCAN_END]]
diff --git a/test/Transforms/InstCombine/gep-custom-dl.ll b/test/Transforms/InstCombine/gep-custom-dl.ll
index 3de70f3c151..0980451d8ec 100644
--- a/test/Transforms/InstCombine/gep-custom-dl.ll
+++ b/test/Transforms/InstCombine/gep-custom-dl.ll
@@ -75,8 +75,8 @@ define void @test_evaluate_gep_as_ptrs_array(i8 addrspace(2)* %B) {
 
 define i32* @test4(i32* %I, i32 %C, i32 %D) {
 ; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[A:%.*]] = getelementptr i32, i32* [[I:%.*]], i32 [[C:%.*]]
-; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, i32* [[A]], i32 [[D:%.*]]
+; CHECK-NEXT:    [[B_IDX:%.*]] = add i32 [[D:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, i32* [[I:%.*]], i32 [[B_IDX]]
 ; CHECK-NEXT:    ret i32* [[B]]
 ;
   %A = getelementptr i32, i32* %I, i32 %C
diff --git a/test/Transforms/InstCombine/getelementptr.ll b/test/Transforms/InstCombine/getelementptr.ll
index f2a336767fd..688303d308c 100644
--- a/test/Transforms/InstCombine/getelementptr.ll
+++ b/test/Transforms/InstCombine/getelementptr.ll
@@ -115,8 +115,8 @@ define void @test_evaluate_gep_as_ptrs_array(i8 addrspace(2)* %B) {
 
 define i32* @test7(i32* %I, i64 %C, i64 %D) {
 ; CHECK-LABEL: @test7(
-; CHECK-NEXT:    [[A:%.*]] = getelementptr i32, i32* [[I:%.*]], i64 [[C:%.*]]
-; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, i32* [[A]], i64 [[D:%.*]]
+; CHECK-NEXT:    [[B_IDX:%.*]] = add i64 [[D:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = getelementptr i32, i32* [[I:%.*]], i64 [[B_IDX]]
 ; CHECK-NEXT:    ret i32* [[B]]
 ;
   %A = getelementptr i32, i32* %I, i64 %C
diff --git a/test/Transforms/InstCombine/select-gep.ll b/test/Transforms/InstCombine/select-gep.ll
index 519f0a94a13..2e112fe93a4 100644
--- a/test/Transforms/InstCombine/select-gep.ll
+++ b/test/Transforms/InstCombine/select-gep.ll
@@ -102,10 +102,10 @@ define i32* @test2b(i32* %p, i64 %x, i64 %y) {
 ; PR51069
 define i32* @test2c(i32* %p, i64 %x, i64 %y) {
 ; CHECK-LABEL: @test2c(
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 [[X:%.*]]
-; CHECK-NEXT:    [[ICMP:%.*]] = icmp ugt i64 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp ugt i64 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    [[SEL_IDX:%.*]] = select i1 [[ICMP]], i64 0, i64 6
-; CHECK-NEXT:    [[SEL:%.*]] = getelementptr i32, i32* [[GEP1]], i64 [[SEL_IDX]]
+; CHECK-NEXT:    [[SEL_IDX1:%.*]] = add i64 [[SEL_IDX]], [[X]]
+; CHECK-NEXT:    [[SEL:%.*]] = getelementptr i32, i32* [[P:%.*]], i64 [[SEL_IDX1]]
 ; CHECK-NEXT:    ret i32* [[SEL]]
 ;
   %gep1 = getelementptr inbounds i32, i32* %p, i64 %x
@@ -118,10 +118,10 @@ define i32* @test2c(i32* %p, i64 %x, i64 %y) {
 ; PR51069
 define i32* @test2d(i32* %p, i64 %x, i64 %y) {
 ; CHECK-LABEL: @test2d(
-; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 [[X:%.*]]
-; CHECK-NEXT:    [[ICMP:%.*]] = icmp ugt i64 [[X]], [[Y:%.*]]
+; CHECK-NEXT:    [[ICMP:%.*]] = icmp ugt i64 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    [[SEL_IDX:%.*]] = select i1 [[ICMP]], i64 6, i64 0
-; CHECK-NEXT:    [[SEL:%.*]] = getelementptr i32, i32* [[GEP1]], i64 [[SEL_IDX]]
+; CHECK-NEXT:    [[SEL_IDX1:%.*]] = add i64 [[SEL_IDX]], [[X]]
+; CHECK-NEXT:    [[SEL:%.*]] = getelementptr i32, i32* [[P:%.*]], i64 [[SEL_IDX1]]
 ; CHECK-NEXT:    ret i32* [[SEL]]
 ;
   %gep1 = getelementptr inbounds i32, i32* %p, i64 %x
diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll
index 2c5c4a7dbe1..f87de574bc9 100644
--- a/test/Transforms/InstCombine/shift.ll
+++ b/test/Transforms/InstCombine/shift.ll
@@ -1774,10 +1774,10 @@ define void @ashr_out_of_range(i177* %A) {
 define void @ashr_out_of_range_1(i177* %A) {
 ; CHECK-LABEL: @ashr_out_of_range_1(
 ; CHECK-NEXT:    [[L:%.*]] = load i177, i177* [[A:%.*]], align 4
-; CHECK-NEXT:    [[G11:%.*]] = getelementptr i177, i177* [[A]], i64 -1
 ; CHECK-NEXT:    [[B24_LOBIT:%.*]] = ashr i177 [[L]], 175
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i177 [[B24_LOBIT]] to i64
-; CHECK-NEXT:    [[G62:%.*]] = getelementptr i177, i177* [[G11]], i64 [[TMP1]]
+; CHECK-NEXT:    [[G62_IDX:%.*]] = add i64 [[TMP1]], -1
+; CHECK-NEXT:    [[G62:%.*]] = getelementptr i177, i177* [[A]], i64 [[G62_IDX]]
 ; CHECK-NEXT:    store i177 0, i177* [[G62]], align 4
 ; CHECK-NEXT:    ret void
 ;
diff --git a/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
index 5cd5af5dd9e..d406c6de157 100644
--- a/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
+++ b/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll
@@ -34,30 +34,30 @@ define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0{
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[INDEX]], -1
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], [[N]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[DOTNEG:%.*]] = mul i32 [[TMP7]], -8
-; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[DOTNEG]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP6]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[TMP10]] to <vscale x 8 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x double>, <vscale x 8 x double>* [[TMP11]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[DOTNEG:%.*]] = mul i32 [[TMP6]], -8
+; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[DOTNEG]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[DOTIDX:%.*]] = add nsw i64 [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[DOTIDX]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast double* [[TMP9]] to <vscale x 8 x double>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x double>, <vscale x 8 x double>* [[TMP10]], align 8, !alias.scope !0
 ; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP12:%.*]] = fadd <vscale x 8 x double> [[REVERSE]], shufflevector (<vscale x 8 x double> insertelement (<vscale x 8 x double> poison, double 1.000000e+00, i32 0), <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[REVERSE6:%.*]] = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> [[TMP12]])
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[DOTNEG7:%.*]] = mul i32 [[TMP14]], -8
-; CHECK-NEXT:    [[TMP15:%.*]] = or i32 [[DOTNEG7]], 1
-; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds double, double* [[TMP13]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast double* [[TMP17]] to <vscale x 8 x double>*
-; CHECK-NEXT:    store <vscale x 8 x double> [[REVERSE6]], <vscale x 8 x double>* [[TMP18]], align 8, !alias.scope !3, !noalias !0
-; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP20:%.*]] = shl i64 [[TMP19]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]]
-; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = fadd <vscale x 8 x double> [[REVERSE]], shufflevector (<vscale x 8 x double> insertelement (<vscale x 8 x double> poison, double 1.000000e+00, i32 0), <vscale x 8 x double> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[REVERSE6:%.*]] = call <vscale x 8 x double> @llvm.experimental.vector.reverse.nxv8f64(<vscale x 8 x double> [[TMP11]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[DOTNEG7:%.*]] = mul i32 [[TMP12]], -8
+; CHECK-NEXT:    [[TMP13:%.*]] = or i32 [[DOTNEG7]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-NEXT:    [[DOTIDX8:%.*]] = add nsw i64 [[TMP5]], [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[DOTIDX8]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast double* [[TMP15]] to <vscale x 8 x double>*
+; CHECK-NEXT:    store <vscale x 8 x double> [[REVERSE6]], <vscale x 8 x double>* [[TMP16]], align 8, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP18:%.*]] = shl i64 [[TMP17]], 3
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -72,8 +72,8 @@ define void @vector_reverse_f64(i64 %N, double* %a, double* %b) #0{
 ; CHECK-NEXT:    [[I_08_IN:%.*]] = phi i64 [ [[I_08:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[I_08]] = add nsw i64 [[I_08_IN]], -1
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[B]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP22:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP22]], 1.000000e+00
+; CHECK-NEXT:    [[TMP20:%.*]] = load double, double* [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP20]], 1.000000e+00
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]]
 ; CHECK-NEXT:    store double [[ADD]], double* [[ARRAYIDX1]], align 8
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[I_08_IN]], 1
@@ -126,30 +126,30 @@ define void @vector_reverse_i64(i64 %N, i64* %a, i64* %b) #0 {
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = xor i64 [[INDEX]], -1
 ; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[TMP4]], [[N]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[DOTNEG:%.*]] = mul i32 [[TMP7]], -8
-; CHECK-NEXT:    [[TMP8:%.*]] = or i32 [[DOTNEG]], 1
-; CHECK-NEXT:    [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i64, i64* [[TMP6]], i64 [[TMP9]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i64* [[TMP10]] to <vscale x 8 x i64>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, <vscale x 8 x i64>* [[TMP11]], align 8, !alias.scope !9
+; CHECK-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[DOTNEG:%.*]] = mul i32 [[TMP6]], -8
+; CHECK-NEXT:    [[TMP7:%.*]] = or i32 [[DOTNEG]], 1
+; CHECK-NEXT:    [[TMP8:%.*]] = sext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[DOTIDX:%.*]] = add nsw i64 [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[DOTIDX]]
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i64* [[TMP9]] to <vscale x 8 x i64>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i64>, <vscale x 8 x i64>* [[TMP10]], align 8, !alias.scope !9
 ; CHECK-NEXT:    [[REVERSE:%.*]] = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> [[WIDE_LOAD]])
-; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 8 x i64> [[REVERSE]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i32 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[REVERSE6:%.*]] = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> [[TMP12]])
-; CHECK-NEXT:    [[TMP14:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT:    [[DOTNEG7:%.*]] = mul i32 [[TMP14]], -8
-; CHECK-NEXT:    [[TMP15:%.*]] = or i32 [[DOTNEG7]], 1
-; CHECK-NEXT:    [[TMP16:%.*]] = sext i32 [[TMP15]] to i64
-; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i64, i64* [[TMP13]], i64 [[TMP16]]
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast i64* [[TMP17]] to <vscale x 8 x i64>*
-; CHECK-NEXT:    store <vscale x 8 x i64> [[REVERSE6]], <vscale x 8 x i64>* [[TMP18]], align 8, !alias.scope !12, !noalias !9
-; CHECK-NEXT:    [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP20:%.*]] = shl i64 [[TMP19]], 3
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP20]]
-; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 8 x i64> [[REVERSE]], shufflevector (<vscale x 8 x i64> insertelement (<vscale x 8 x i64> poison, i64 1, i32 0), <vscale x 8 x i64> poison, <vscale x 8 x i32> zeroinitializer)
+; CHECK-NEXT:    [[REVERSE6:%.*]] = call <vscale x 8 x i64> @llvm.experimental.vector.reverse.nxv8i64(<vscale x 8 x i64> [[TMP11]])
+; CHECK-NEXT:    [[TMP12:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[DOTNEG7:%.*]] = mul i32 [[TMP12]], -8
+; CHECK-NEXT:    [[TMP13:%.*]] = or i32 [[DOTNEG7]], 1
+; CHECK-NEXT:    [[TMP14:%.*]] = sext i32 [[TMP13]] to i64
+; CHECK-NEXT:    [[DOTIDX8:%.*]] = add nsw i64 [[TMP5]], [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[DOTIDX8]]
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast i64* [[TMP15]] to <vscale x 8 x i64>*
+; CHECK-NEXT:    store <vscale x 8 x i64> [[REVERSE6]], <vscale x 8 x i64>* [[TMP16]], align 8, !alias.scope !12, !noalias !9
+; CHECK-NEXT:    [[TMP17:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP18:%.*]] = shl i64 [[TMP17]], 3
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]]
+; CHECK-NEXT:    [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -164,8 +164,8 @@ define void @vector_reverse_i64(i64 %N, i64* %a, i64* %b) #0 {
 ; CHECK-NEXT:    [[I_09_IN:%.*]] = phi i64 [ [[I_09:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[I_09]] = add nsw i64 [[I_09_IN]], -1
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i64, i64* [[B]], i64 [[I_09]]
-; CHECK-NEXT:    [[TMP22:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[TMP22]], 1
+; CHECK-NEXT:    [[TMP20:%.*]] = load i64, i64* [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[TMP20]], 1
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, i64* [[A]], i64 [[I_09]]
 ; CHECK-NEXT:    store i64 [[ADD]], i64* [[ARRAYIDX2]], align 8
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[I_09_IN]], 1
diff --git a/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
index 077d3c1f71b..4233760333a 100644
--- a/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ b/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -44,32 +44,30 @@ define void @vector_reverse_mask_v4i1(double* %a, double* %cond, i64 %N) #0 {
 ; CHECK-NEXT:    [[TMP4:%.*]] = bitcast double* [[TMP3]] to <4 x double>*
 ; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x double>, <4 x double>* [[TMP4]], align 8, !alias.scope !0
 ; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds double, double* [[TMP2]], i64 -4
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds double, double* [[TMP5]], i64 -3
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast double* [[TMP6]] to <4 x double>*
-; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x double>, <4 x double>* [[TMP7]], align 8, !alias.scope !0
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds double, double* [[TMP2]], i64 -7
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast double* [[TMP5]] to <4 x double>*
+; CHECK-NEXT:    [[WIDE_LOAD6:%.*]] = load <4 x double>, <4 x double>* [[TMP6]], align 8, !alias.scope !0
 ; CHECK-NEXT:    [[REVERSE7:%.*]] = shufflevector <4 x double> [[WIDE_LOAD6]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP8:%.*]] = fcmp une <4 x double> [[REVERSE]], zeroinitializer
-; CHECK-NEXT:    [[TMP9:%.*]] = fcmp une <4 x double> [[REVERSE7]], zeroinitializer
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds double, double* [[TMP10]], i64 -3
-; CHECK-NEXT:    [[REVERSE8:%.*]] = shufflevector <4 x i1> [[TMP8]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP12:%.*]] = bitcast double* [[TMP11]] to <4 x double>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP12]], i32 8, <4 x i1> [[REVERSE8]], <4 x double> poison), !alias.scope !3, !noalias !0
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds double, double* [[TMP10]], i64 -4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds double, double* [[TMP13]], i64 -3
-; CHECK-NEXT:    [[REVERSE10:%.*]] = shufflevector <4 x i1> [[TMP9]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast double* [[TMP14]] to <4 x double>*
-; CHECK-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP15]], i32 8, <4 x i1> [[REVERSE10]], <4 x double> poison), !alias.scope !3, !noalias !0
-; CHECK-NEXT:    [[TMP16:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
-; CHECK-NEXT:    [[TMP17:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
-; CHECK-NEXT:    [[TMP18:%.*]] = bitcast double* [[TMP11]] to <4 x double>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP16]], <4 x double>* [[TMP18]], i32 8, <4 x i1> [[REVERSE8]]), !alias.scope !3, !noalias !0
-; CHECK-NEXT:    [[TMP19:%.*]] = bitcast double* [[TMP14]] to <4 x double>*
-; CHECK-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP17]], <4 x double>* [[TMP19]], i32 8, <4 x i1> [[REVERSE10]]), !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP7:%.*]] = fcmp une <4 x double> [[REVERSE]], zeroinitializer
+; CHECK-NEXT:    [[TMP8:%.*]] = fcmp une <4 x double> [[REVERSE7]], zeroinitializer
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds double, double* [[TMP9]], i64 -3
+; CHECK-NEXT:    [[REVERSE8:%.*]] = shufflevector <4 x i1> [[TMP7]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast double* [[TMP10]] to <4 x double>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP11]], i32 8, <4 x i1> [[REVERSE8]], <4 x double> poison), !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds double, double* [[TMP9]], i64 -7
+; CHECK-NEXT:    [[REVERSE10:%.*]] = shufflevector <4 x i1> [[TMP8]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast double* [[TMP12]] to <4 x double>*
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD11:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* nonnull [[TMP13]], i32 8, <4 x i1> [[REVERSE10]], <4 x double> poison), !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP14:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT:    [[TMP15:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD11]], <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>
+; CHECK-NEXT:    [[TMP16:%.*]] = bitcast double* [[TMP10]] to <4 x double>*
+; CHECK-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP14]], <4 x double>* [[TMP16]], i32 8, <4 x i1> [[REVERSE8]]), !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast double* [[TMP12]] to <4 x double>*
+; CHECK-NEXT:    call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> [[TMP15]], <4 x double>* [[TMP17]], i32 8, <4 x i1> [[REVERSE10]]), !alias.scope !3, !noalias !0
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
+; CHECK-NEXT:    [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
@@ -84,13 +82,13 @@ define void @vector_reverse_mask_v4i1(double* %a, double* %cond, i64 %N) #0 {
 ; CHECK-NEXT:    [[I_08_IN:%.*]] = phi i64 [ [[I_08:%.*]], [[FOR_INC:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT:    [[I_08]] = add nsw i64 [[I_08_IN]], -1
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds double, double* [[COND]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP21:%.*]] = load double, double* [[ARRAYIDX]], align 8
-; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une double [[TMP21]], 0.000000e+00
+; CHECK-NEXT:    [[TMP19:%.*]] = load double, double* [[ARRAYIDX]], align 8
+; CHECK-NEXT:    [[TOBOOL:%.*]] = fcmp une double [[TMP19]], 0.000000e+00
 ; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[FOR_INC]]
 ; CHECK:       if.then:
 ; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds double, double* [[A]], i64 [[I_08]]
-; CHECK-NEXT:    [[TMP22:%.*]] = load double, double* [[ARRAYIDX1]], align 8
-; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP22]], 1.000000e+00
+; CHECK-NEXT:    [[TMP20:%.*]] = load double, double* [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    [[ADD:%.*]] = fadd double [[TMP20]], 1.000000e+00
 ; CHECK-NEXT:    store double [[ADD]], double* [[ARRAYIDX1]], align 8
 ; CHECK-NEXT:    br label [[FOR_INC]]
 ; CHECK:       for.inc:
diff --git a/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll b/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
index 22e28f91623..40d310d4524 100644
--- a/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
+++ b/test/Transforms/LoopVectorize/ARM/mve-reduction-predselect.ll
@@ -22,13 +22,13 @@ define i32 @reduction_sum_single(i32* noalias nocapture %A) {
 ; CHECK-NEXT:    [[TMP4]] = add i32 [[TMP3]], [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
 ; CHECK:       .lr.ph:
-; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       ._crit_edge:
 ; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
@@ -80,13 +80,13 @@ define i32 @reduction_sum(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
 ; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
 ; CHECK:       .lr.ph:
-; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], [[LOOP5:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       ._crit_edge:
 ; CHECK-NEXT:    [[SUM_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP12]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[SUM_0_LCSSA]]
@@ -134,14 +134,14 @@ define i32 @reduction_prod(i32* noalias nocapture %A, i32* noalias nocapture %B)
 ; CHECK-NEXT:    [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP6]])
 ; CHECK-NEXT:    br i1 true, label [[DOT_CRIT_EDGE:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[DOTLR_PH:%.*]]
 ; CHECK:       .lr.ph:
-; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], [[LOOP7:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 undef, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       ._crit_edge:
 ; CHECK-NEXT:    [[PROD_0_LCSSA:%.*]] = phi i32 [ undef, [[DOTLR_PH]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[PROD_0_LCSSA]]
@@ -188,14 +188,14 @@ define i32 @reduction_and(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP5]], <4 x i32> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP6]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP9:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
@@ -242,14 +242,14 @@ define i32 @reduction_or(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP6]] = or <4 x i32> [[VEC_PHI]], [[TMP5]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP6]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP11:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
@@ -296,14 +296,14 @@ define i32 @reduction_xor(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP6]] = xor <4 x i32> [[VEC_PHI]], [[TMP5]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP12:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP6]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP13:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
@@ -350,14 +350,14 @@ define float @reduction_fadd(float* nocapture %A, float* nocapture %B) {
 ; CHECK-NEXT:    [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP14:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[TMP6]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP15:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[RESULT_0_LCSSA]]
@@ -404,14 +404,14 @@ define float @reduction_fmul(float* nocapture %A, float* nocapture %B) {
 ; CHECK-NEXT:    [[TMP6]] = select <4 x i1> [[ACTIVE_LANE_MASK]], <4 x float> [[TMP5]], <4 x float> [[VEC_PHI]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260
-; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP16:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP8:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP6]])
 ; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], [[LOOP17:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 undef, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi float [ undef, [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret float [[RESULT_0_LCSSA]]
@@ -453,7 +453,7 @@ define i32 @reduction_min(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP18:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -470,7 +470,7 @@ define i32 @reduction_min(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[V0]] = select i1 [[C0]], i32 [[RESULT_08]], i32 [[L0]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP19:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[V0]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
@@ -510,7 +510,7 @@ define i32 @reduction_max(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[TMP3]] = select <4 x i1> [[TMP2]], <4 x i32> [[VEC_PHI]], <4 x i32> [[WIDE_LOAD]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 256
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP20:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP3]])
 ; CHECK-NEXT:    br i1 false, label [[FOR_END:%.*]], label [[SCALAR_PH]]
@@ -527,7 +527,7 @@ define i32 @reduction_max(i32* nocapture %A, i32* nocapture %B) {
 ; CHECK-NEXT:    [[V0]] = select i1 [[C0]], i32 [[RESULT_08]], i32 [[L0]]
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add i32 [[INDVARS_IV]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVARS_IV_NEXT]], 257
-; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], [[LOOP21:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
 ; CHECK:       for.end:
 ; CHECK-NEXT:    [[RESULT_0_LCSSA:%.*]] = phi i32 [ [[V0]], [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[RESULT_0_LCSSA]]
diff --git a/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
index 08a85b8305f..a7cc9ed411b 100644
--- a/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
+++ b/test/Transforms/LoopVectorize/ARM/mve-reductions.ll
@@ -1154,28 +1154,26 @@ define i32 @reduction_interleave_group(i32 %n, i32* %arr) #0 {
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = or i32 [[OFFSET_IDX]], 1
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i32 -1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 [[TMP3]]
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i32 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <8 x i32>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[STRIDED_VEC1]])
-; CHECK-NEXT:    [[TMP8:%.*]] = add i32 [[TMP7]], [[VEC_PHI]]
-; CHECK-NEXT:    [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[STRIDED_VEC]])
-; CHECK-NEXT:    [[TMP10]] = add i32 [[TMP9]], [[TMP8]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[STRIDED_VEC1]])
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[TMP5]], [[VEC_PHI]]
+; CHECK-NEXT:    [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[STRIDED_VEC]])
+; CHECK-NEXT:    [[TMP8]] = add i32 [[TMP7]], [[TMP6]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
 ; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
-; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP10]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP8]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
@@ -1191,7 +1189,7 @@ define i32 @reduction_interleave_group(i32 %n, i32* %arr) #0 {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[IV_NEXT]], [[N]]
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT]], !llvm.loop [[LOOP27:![0-9]+]]
 ; CHECK:       exit:
-; CHECK-NEXT:    [[RET_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RED_2]], [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[RET_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RED_2]], [[FOR_BODY]] ], [ [[TMP8]], [[MIDDLE_BLOCK]] ]
 ; CHECK-NEXT:    ret i32 [[RET_LCSSA]]
diff --git a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
index 884d743a1ba..a80140fea41 100644
--- a/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
+++ b/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll
@@ -1439,19 +1439,17 @@ define dso_local void @masked_strided2(i8* noalias nocapture readonly %p, i8* no
 ; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = or i32 [[TMP1]], 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC1]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC1]], <8 x i8> [[STRIDED_VEC]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
-; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i8> [[STRIDED_VEC1]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = sub <8 x i8> zeroinitializer, [[TMP5]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP8]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
 ; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP11]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1024
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP9]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; ENABLED_MASKED_STRIDED:       for.end:
 ; ENABLED_MASKED_STRIDED-NEXT:    ret void
 ;
@@ -1877,19 +1875,17 @@ define dso_local void @masked_strided2_unknown_tc(i8* noalias nocapture readonly
 ; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP5]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = or i32 [[TMP2]], 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = select <8 x i1> [[TMP7]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = sub <8 x i8> zeroinitializer, [[TMP8]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i8, i8* [[TMP10]], i32 [[TMP6]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>*
-; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP8]], <8 x i8> [[TMP9]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP12]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = select <8 x i1> [[TMP6]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = sub <8 x i8> zeroinitializer, [[TMP7]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP2]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP7]], <8 x i8> [[TMP8]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
 ; ENABLED_MASKED_STRIDED-NEXT:    [[VEC_IND_NEXT]] = add <8 x i32> [[VEC_IND]], <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP13]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP11]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; ENABLED_MASKED_STRIDED:       for.end:
 ; ENABLED_MASKED_STRIDED-NEXT:    ret void
 ;
@@ -2315,18 +2311,16 @@ define dso_local void @unconditional_masked_strided2_unknown_tc(i8* noalias noca
 ; ENABLED_MASKED_STRIDED-NEXT:    [[WIDE_MASKED_VEC:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP3]], i32 1, <16 x i1> [[INTERLEAVED_MASK]], <16 x i8> poison)
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
 ; ENABLED_MASKED_STRIDED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i8> [[WIDE_MASKED_VEC]], <16 x i8> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = or i32 [[TMP1]], 1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = select <8 x i1> [[TMP5]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = sub <8 x i8> zeroinitializer, [[TMP6]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 -1
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i8, i8* [[TMP8]], i32 [[TMP4]]
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP10:%.*]] = bitcast i8* [[TMP9]] to <16 x i8>*
-; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP6]], <8 x i8> [[TMP7]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
-; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP10]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP4:%.*]] = icmp slt <8 x i8> [[STRIDED_VEC]], [[STRIDED_VEC3]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP5:%.*]] = select <8 x i1> [[TMP4]], <8 x i8> [[STRIDED_VEC3]], <8 x i8> [[STRIDED_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP6:%.*]] = sub <8 x i8> zeroinitializer, [[TMP5]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[Q:%.*]], i32 [[TMP1]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <16 x i8>*
+; ENABLED_MASKED_STRIDED-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i8> [[TMP5]], <8 x i8> [[TMP6]], <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; ENABLED_MASKED_STRIDED-NEXT:    call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[INTERLEAVED_VEC]], <16 x i8>* [[TMP8]], i32 1, <16 x i1> [[INTERLEAVED_MASK]])
 ; ENABLED_MASKED_STRIDED-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 8
-; ENABLED_MASKED_STRIDED-NEXT:    [[TMP11:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP11]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; ENABLED_MASKED_STRIDED-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; ENABLED_MASKED_STRIDED-NEXT:    br i1 [[TMP9]], label [[FOR_END]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; ENABLED_MASKED_STRIDED:       for.end:
 ; ENABLED_MASKED_STRIDED-NEXT:    ret void
 ;
diff --git a/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll b/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
index 0a127ad4ef8..89c6efa6945 100644
--- a/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
+++ b/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
@@ -50,8 +50,8 @@ for.end:
 ; CHECK: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
 ; CHECK: %offset.idx = sub i64 %n, %index
 ; CHECK-NOT: getelementptr
-; CHECK: %[[G0:.+]] = getelementptr inbounds i32, i32* %a, i64 -3
-; CHECK: getelementptr inbounds i32, i32* %[[G0]], i64 %offset.idx
+; CHECK: %[[G0IDX:.+]] = add nsw i64 %offset.idx, -3
+; CHECK: getelementptr inbounds i32, i32* %a, i64 %[[G0IDX]]
 ; CHECK-NOT: getelementptr
 ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
 ;
diff --git a/test/Transforms/LoopVectorize/interleaved-accesses.ll b/test/Transforms/LoopVectorize/interleaved-accesses.ll
index 3e77d76a26a..e56b607342e 100644
--- a/test/Transforms/LoopVectorize/interleaved-accesses.ll
+++ b/test/Transforms/LoopVectorize/interleaved-accesses.ll
@@ -686,19 +686,17 @@ define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i32>, <8 x i32>* [[TMP1]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[OFFSET_IDX]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[STRIDED_VEC]]
 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; CHECK-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 -1
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP5]], i64 [[TMP2]]
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <8 x i32>*
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <8 x i32>*
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
-; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512
+; CHECK-NEXT:    br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -762,19 +760,17 @@ define void @mixed_load3_store3(i32* nocapture %A) {
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>
 ; CHECK-NEXT:    [[TMP2:%.*]] = add <4 x i32> [[STRIDED_VEC]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[NEXT_GEP]], i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
-; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i64 -2
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <12 x i32>*
-; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
-; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add <4 x i32> [[STRIDED_VEC2]], [[VEC_IND]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add <4 x i32> [[STRIDED_VEC3]], [[VEC_IND]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32* [[NEXT_GEP]] to <12 x i32>*
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> [[TMP7]], <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
+; CHECK-NEXT:    store <12 x i32> [[INTERLEAVED_VEC]], <12 x i32>* [[TMP5]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
-; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -1319,23 +1315,21 @@ define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) {
 ; CHECK-NEXT:    [[TMP3:%.*]] = or i64 [[OFFSET_IDX]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 4
 ; CHECK-NEXT:    [[TMP5:%.*]] = or i64 [[OFFSET_IDX]], 6
-; CHECK-NEXT:    [[TMP6:%.*]] = or i64 [[OFFSET_IDX]], 1
-; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
-; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 -1
-; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP7]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[TMP5]]
+; CHECK-NEXT:    store i32 [[X:%.*]], i32* [[TMP6]], align 4
+; CHECK-NEXT:    store i32 [[X]], i32* [[TMP7]], align 4
 ; CHECK-NEXT:    store i32 [[X]], i32* [[TMP8]], align 4
 ; CHECK-NEXT:    store i32 [[X]], i32* [[TMP9]], align 4
-; CHECK-NEXT:    store i32 [[X]], i32* [[TMP10]], align 4
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
-; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP13]], align 4
+; CHECK-NEXT:    store <8 x i32> [[INTERLEAVED_VEC]], <8 x i32>* [[TMP11]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
+; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
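
-- 

A minimal before/after sketch of the new fold in isolation (the function and
value names here are illustrative only, not taken from the patch's tests):

  define i32* @src(i32* %p, i64 %x, i64 %y) {
    %gep1 = getelementptr inbounds i32, i32* %p, i64 %x   ; single use, below
    %gep2 = getelementptr inbounds i32, i32* %gep1, i64 %y
    ret i32* %gep2
  }

becomes

  define i32* @tgt(i32* %p, i64 %x, i64 %y) {
    %gep2.idx = add nsw i64 %x, %y                        ; name from GEP.getName() + ".idx"
    %gep2 = getelementptr inbounds i32, i32* %p, i64 %gep2.idx
    ret i32* %gep2
  }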
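On the wrap flags: per the NewInBounds logic, the merged gep is only inbounds
(and the new add only nsw) when both original geps were inbounds. When the
outer gep has lost inbounds, as in test2c/test2d above where the earlier
select fold strips it, the expected output is a plain add feeding a plain
gep (sketch following the test2c CHECK lines):

  %sel.idx1 = add i64 %sel.idx, %x                        ; no nsw
  %sel = getelementptr i32, i32* %p, i64 %sel.idx1        ; not inbounds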