1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

Update LSR's logic that identifies a post-increment SCEV value.

One of the checks has been removed as it seems invalid.
The LoopStep size is almost always 32-bit.

Differential Revision: https://reviews.llvm.org/D75079
This commit is contained in:
Sumanth Gundapaneni 2020-03-02 16:32:19 -06:00
parent d3dc81a22a
commit 77cbec7e52
5 changed files with 168 additions and 117 deletions

View File

@ -3530,9 +3530,6 @@ static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
const SCEV *LoopStep = AR->getStepRecurrence(SE);
if (!isa<SCEVConstant>(LoopStep))
return false;
if (LU.AccessTy.getType()->getScalarSizeInBits() !=
LoopStep->getType()->getScalarSizeInBits())
return false;
// Check if a post-indexed load/store can be used.
if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {

View File

@ -1,7 +1,7 @@
; RUN: llc -march=hexagon < %s | FileCheck %s
; CHECK: [[REG0:(r[0-9]+)]] = add(r29
; CHECK: [[REG1:(r[0-9]+)]] = add([[REG0]],#4)
; CHECK: [[REG1:(r[0-9]+)]] = add([[REG0]],#8)
; CHECK-DAG: memd([[REG1]]+#8) =
; CHECK-DAG: memd([[REG1]]+#0) =

View File

@ -0,0 +1,50 @@
; RUN: llc -O3 -march=hexagon < %s | FileCheck %s
; Test to ensure LSR does not optimize out the addrec of the outer loop.
; This helps to generate post-increment instructions; otherwise
; we end up with an extra reg+reg add inside the loop.
; CHECK: loop0(.LBB0_[[LOOP:.]],
; CHECK: .LBB0_[[LOOP]]:
; CHECK: memuh{{.*}}++
; CHECK: endloop
; foo(filt, inp, c1, c2): nested-loop i16 reduction used as LSR test input.
; The outer loop runs %c1 times and the inner loop runs %c2 times; the loops
; are entered only when both %c1 > 0 and %c2 > 0. Every inner iteration loads
; one i16 through the %filt cursor and one through the %inp cursor, adds both
; to the running i16 sum, and advances each cursor by one element.
; Returns the accumulated sum, or 0 when the loops are skipped.
define dso_local signext i16 @foo(i16* nocapture readonly %filt, i16* nocapture readonly %inp, i32 %c1, i32 %c2) local_unnamed_addr {
entry:
; Guard: skip straight to the exit unless both trip counts are positive.
%cmp28 = icmp sgt i32 %c1, 0
%cmp221 = icmp sgt i32 %c2, 0
%or.cond = and i1 %cmp28, %cmp221
br i1 %or.cond, label %for.cond1.preheader.us, label %for.cond.cleanup
; Outer-loop header (also the inner-loop preheader).
for.cond1.preheader.us: ; preds = %entry, %for.cond1.for.cond.cleanup3_crit_edge.us
; Outer-loop-carried state: the two base pointers, the outer counter, and the
; running sum. Each starts from the function argument / zero on entry.
%filt.addr.032.us = phi i16* [ %scevgep, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %filt, %entry ]
%inp.addr.031.us = phi i16* [ %scevgep35, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ %inp, %entry ]
%l.030.us = phi i32 [ %inc11.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ]
%sum0.029.us = phi i16 [ %add8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %entry ]
; Next outer iteration's %filt base: current base advanced by %c2 elements.
%scevgep = getelementptr i16, i16* %filt.addr.032.us, i32 %c2
br label %for.body4.us
; Inner loop: single basic block that branches back to itself.
for.body4.us: ; preds = %for.body4.us, %for.cond1.preheader.us
; Inner-loop-carried state: counter from 0, both cursors seeded from the outer
; base pointers, and the sum seeded from the outer running sum.
%z.025.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ]
%filt.addr.124.us = phi i16* [ %filt.addr.032.us, %for.cond1.preheader.us ], [ %incdec.ptr.us, %for.body4.us ]
%inp.addr.123.us = phi i16* [ %inp.addr.031.us, %for.cond1.preheader.us ], [ %incdec.ptr5.us, %for.body4.us ]
%sum0.122.us = phi i16 [ %sum0.029.us, %for.cond1.preheader.us ], [ %add8.us, %for.body4.us ]
; Each load goes through the pre-increment cursor while the cursor + 1 element
; feeds the phi above; the test's FileCheck patterns expect a post-incrementing
; memuh access inside the loop0/endloop body for this shape.
%incdec.ptr.us = getelementptr inbounds i16, i16* %filt.addr.124.us, i32 1
%0 = load i16, i16* %filt.addr.124.us, align 2
%incdec.ptr5.us = getelementptr inbounds i16, i16* %inp.addr.123.us, i32 1
%1 = load i16, i16* %inp.addr.123.us, align 2
; sum += filt element + inp element (plain i16 adds, no nsw/nuw flags).
%add.us = add i16 %0, %sum0.122.us
%add8.us = add i16 %add.us, %1
; Inner counter step; leave the inner loop once it reaches %c2.
%inc.us = add nuw nsw i32 %z.025.us, 1
%exitcond = icmp eq i32 %inc.us, %c2
br i1 %exitcond, label %for.cond1.for.cond.cleanup3_crit_edge.us, label %for.body4.us
; Outer-loop latch, reached when the inner loop finishes.
for.cond1.for.cond.cleanup3_crit_edge.us: ; preds = %for.body4.us
; Next outer iteration's %inp base: current base advanced by %c2 elements.
%scevgep35 = getelementptr i16, i16* %inp.addr.031.us, i32 %c2
; Outer counter step; leave the outer loop once it reaches %c1.
%inc11.us = add nuw nsw i32 %l.030.us, 1
%exitcond36 = icmp eq i32 %inc11.us, %c1
br i1 %exitcond36, label %for.cond.cleanup, label %for.cond1.preheader.us
; Common exit: result is 0 if the guard failed, otherwise the final sum.
for.cond.cleanup: ; preds = %for.cond1.for.cond.cleanup3_crit_edge.us, %entry
%sum0.0.lcssa = phi i16 [ 0, %entry ], [ %add8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ]
ret i16 %sum0.0.lcssa
}

View File

@ -1778,11 +1778,11 @@ for.body: ; preds = %for.body, %for.body
define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* nocapture readonly %b, i32 %N) {
; CHECK-LABEL: half_short_mac:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: cbz r2, .LBB11_3
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: subs r3, r2, #1
; CHECK-NEXT: and r7, r2, #3
; CHECK-NEXT: and r6, r2, #3
; CHECK-NEXT: cmp r3, #3
; CHECK-NEXT: bhs .LBB11_4
; CHECK-NEXT: @ %bb.2:
@ -1799,33 +1799,33 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
; CHECK-NEXT: vldr s0, .LCPI11_0
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
; CHECK-NEXT: movs r3, #0
; CHECK-NEXT: adds r3, r1, #4
; CHECK-NEXT: adds r2, r0, #4
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB11_5: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: adds r2, r1, r3
; CHECK-NEXT: adds r6, r0, r3
; CHECK-NEXT: vldr.16 s2, [r6, #6]
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrsh.w r4, [r3, #2]
; CHECK-NEXT: vldr.16 s2, [r2, #2]
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: ldrsh.w r4, [r2, #2]
; CHECK-NEXT: ldrsh.w r5, [r2, #4]
; CHECK-NEXT: ldrsh.w r2, [r2, #6]
; CHECK-NEXT: vmov s8, r4
; CHECK-NEXT: vmov s6, r5
; CHECK-NEXT: vmov s4, r2
; CHECK-NEXT: vmov s4, r4
; CHECK-NEXT: vcvt.f16.s32 s4, s4
; CHECK-NEXT: ldrsh.w r4, [r3]
; CHECK-NEXT: vmul.f16 s2, s2, s4
; CHECK-NEXT: vldr.16 s4, [r6, #4]
; CHECK-NEXT: vldr.16 s4, [r2]
; CHECK-NEXT: vmov s6, r4
; CHECK-NEXT: vcvt.f16.s32 s6, s6
; CHECK-NEXT: ldrsh r5, [r3, #-2]
; CHECK-NEXT: ldrsh r4, [r3, #-4]
; CHECK-NEXT: vmul.f16 s4, s4, s6
; CHECK-NEXT: vldr.16 s6, [r6, #2]
; CHECK-NEXT: vcvt.f16.s32 s8, s8
; CHECK-NEXT: ldrsh r2, [r1, r3]
; CHECK-NEXT: vmul.f16 s6, s6, s8
; CHECK-NEXT: vldr.16 s8, [r6]
; CHECK-NEXT: vldr.16 s6, [r2, #-2]
; CHECK-NEXT: adds r3, #8
; CHECK-NEXT: vmov s10, r2
; CHECK-NEXT: vmov s8, r5
; CHECK-NEXT: vcvt.f16.s32 s8, s8
; CHECK-NEXT: vmov s10, r4
; CHECK-NEXT: vmul.f16 s6, s6, s8
; CHECK-NEXT: vldr.16 s8, [r2, #-4]
; CHECK-NEXT: vcvt.f16.s32 s10, s10
; CHECK-NEXT: adds r2, #8
; CHECK-NEXT: vmul.f16 s8, s8, s10
; CHECK-NEXT: vcvtb.f32.f16 s8, s8
; CHECK-NEXT: vcvtb.f32.f16 s6, s6
@ -1837,11 +1837,11 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB11_5
; CHECK-NEXT: .LBB11_6: @ %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: wls lr, r7, .LBB11_9
; CHECK-NEXT: wls lr, r6, .LBB11_9
; CHECK-NEXT: @ %bb.7: @ %for.body.epil.preheader
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
; CHECK-NEXT: add.w r1, r1, r12, lsl #1
; CHECK-NEXT: mov lr, r7
; CHECK-NEXT: mov lr, r6
; CHECK-NEXT: .LBB11_8: @ %for.body.epil
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrsh r2, [r1], #2
@ -1854,7 +1854,7 @@ define arm_aapcs_vfpcc float @half_short_mac(half* nocapture readonly %a, i16* n
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: le lr, .LBB11_8
; CHECK-NEXT: .LBB11_9: @ %for.cond.cleanup
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.10:
; CHECK-NEXT: .LCPI11_0:

View File

@ -372,29 +372,29 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
; CHECK-NEXT: cmp.w r12, #0
; CHECK-NEXT: beq.w .LBB5_11
; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
; CHECK-NEXT: add.w r4, r3, r12, lsl #2
; CHECK-NEXT: add.w r5, r1, r12
; CHECK-NEXT: cmp r4, r1
; CHECK-NEXT: add.w r6, r0, r12
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: cmp r5, r3
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: cmp r4, r0
; CHECK-NEXT: add.w r6, r3, r12, lsl #2
; CHECK-NEXT: add.w r4, r1, r12
; CHECK-NEXT: cmp r6, r1
; CHECK-NEXT: add.w r5, r0, r12
; CHECK-NEXT: cset lr, hi
; CHECK-NEXT: cmp r4, r3
; CHECK-NEXT: cset r4, hi
; CHECK-NEXT: cmp r6, r3
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: mov.w lr, #1
; CHECK-NEXT: ands r6, r4
; CHECK-NEXT: lsls r6, r6, #31
; CHECK-NEXT: itt eq
; CHECK-NEXT: andeq.w r4, r5, r7
; CHECK-NEXT: lslseq.w r4, r4, #31
; CHECK-NEXT: beq .LBB5_4
; CHECK-NEXT: cmp r5, r3
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: ands r5, r6
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: lsls r5, r5, #31
; CHECK-NEXT: itt eq
; CHECK-NEXT: andeq.w r5, r4, lr
; CHECK-NEXT: lslseq.w r5, r5, #31
; CHECK-NEXT: beq .LBB5_4
; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
; CHECK-NEXT: sub.w r4, r12, #1
; CHECK-NEXT: and r9, r12, #3
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: bhs .LBB5_6
; CHECK-NEXT: sub.w r5, r12, #1
; CHECK-NEXT: and r9, r12, #3
; CHECK-NEXT: cmp r5, #3
; CHECK-NEXT: bhs .LBB5_6
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: b .LBB5_8
@ -409,35 +409,37 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
; CHECK-NEXT: letp lr, .LBB5_5
; CHECK-NEXT: b .LBB5_11
; CHECK-NEXT: .LBB5_6: @ %for.body.preheader.new
; CHECK-NEXT: bic r7, r12, #3
; CHECK-NEXT: bic r5, r12, #3
; CHECK-NEXT: add.w r4, r3, #8
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: subs r5, #4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: add.w lr, lr, r7, lsr #2
; CHECK-NEXT: add.w lr, r6, r5, lsr #2
; CHECK-NEXT: adds r5, r0, #3
; CHECK-NEXT: adds r6, r1, #1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB5_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrb.w r5, [r0, r12]
; CHECK-NEXT: add.w r7, r1, r12
; CHECK-NEXT: ldrb.w r6, [r1, r12]
; CHECK-NEXT: smlabb r5, r6, r5, r2
; CHECK-NEXT: str r5, [r4, #-8]
; CHECK-NEXT: add.w r5, r0, r12
; CHECK-NEXT: ldrb r6, [r7, #1]
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: ldrb.w r8, [r5, #1]
; CHECK-NEXT: smlabb r6, r6, r8, r2
; CHECK-NEXT: str r6, [r4, #-4]
; CHECK-NEXT: ldrb.w r8, [r5, #2]
; CHECK-NEXT: ldrb r6, [r7, #2]
; CHECK-NEXT: smlabb r6, r6, r8, r2
; CHECK-NEXT: str r6, [r4]
; CHECK-NEXT: ldrb r5, [r5, #3]
; CHECK-NEXT: ldrb r6, [r7, #3]
; CHECK-NEXT: smlabb r5, r6, r5, r2
; CHECK-NEXT: str r5, [r4, #4]
; CHECK-NEXT: adds r4, #16
; CHECK-NEXT: le lr, .LBB5_7
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrb r8, [r5, #-3]
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: ldrb r7, [r6, #-1]
; CHECK-NEXT: smlabb r7, r7, r8, r2
; CHECK-NEXT: str r7, [r4, #-8]
; CHECK-NEXT: ldrb r8, [r5, #-2]
; CHECK-NEXT: ldrb r7, [r6]
; CHECK-NEXT: smlabb r7, r7, r8, r2
; CHECK-NEXT: str r7, [r4, #-4]
; CHECK-NEXT: ldrb r8, [r5, #-1]
; CHECK-NEXT: ldrb r7, [r6, #1]
; CHECK-NEXT: smlabb r7, r7, r8, r2
; CHECK-NEXT: str r7, [r4]
; CHECK-NEXT: ldrb.w r8, [r5]
; CHECK-NEXT: adds r5, #4
; CHECK-NEXT: ldrb r7, [r6, #2]
; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: smlabb r7, r7, r8, r2
; CHECK-NEXT: str r7, [r4, #4]
; CHECK-NEXT: adds r4, #16
; CHECK-NEXT: le lr, .LBB5_7
; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup.loopexit.unr-lcssa
; CHECK-NEXT: wls lr, r9, .LBB5_11
; CHECK-NEXT: @ %bb.9: @ %for.body.epil.preheader
@ -447,10 +449,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_char(i8* nocapture readonly
; CHECK-NEXT: mov lr, r9
; CHECK-NEXT: .LBB5_10: @ %for.body.epil
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrb r7, [r0], #1
; CHECK-NEXT: ldrb r6, [r1], #1
; CHECK-NEXT: smlabb r7, r6, r7, r2
; CHECK-NEXT: str r7, [r3], #4
; CHECK-NEXT: ldrb r6, [r0], #1
; CHECK-NEXT: ldrb r5, [r1], #1
; CHECK-NEXT: smlabb r6, r5, r6, r2
; CHECK-NEXT: str r6, [r3], #4
; CHECK-NEXT: le lr, .LBB5_10
; CHECK-NEXT: .LBB5_11: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
@ -663,28 +665,28 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
; CHECK-NEXT: cmp.w r12, #0
; CHECK-NEXT: beq.w .LBB7_11
; CHECK-NEXT: @ %bb.1: @ %for.body.lr.ph
; CHECK-NEXT: add.w r4, r3, r12, lsl #2
; CHECK-NEXT: add.w r5, r1, r12
; CHECK-NEXT: cmp r4, r1
; CHECK-NEXT: add.w r6, r0, r12
; CHECK-NEXT: cset r7, hi
; CHECK-NEXT: add.w r6, r3, r12, lsl #2
; CHECK-NEXT: add.w r4, r1, r12
; CHECK-NEXT: cmp r6, r1
; CHECK-NEXT: add.w r5, r0, r12
; CHECK-NEXT: cset lr, hi
; CHECK-NEXT: cmp r4, r3
; CHECK-NEXT: cset r4, hi
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: cmp r5, r3
; CHECK-NEXT: cset r5, hi
; CHECK-NEXT: cmp r4, r0
; CHECK-NEXT: cset r4, hi
; CHECK-NEXT: cmp r6, r3
; CHECK-NEXT: cset r6, hi
; CHECK-NEXT: mov.w lr, #1
; CHECK-NEXT: ands r6, r4
; CHECK-NEXT: lsls r6, r6, #31
; CHECK-NEXT: ands r5, r6
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: lsls r5, r5, #31
; CHECK-NEXT: itt eq
; CHECK-NEXT: andeq.w r4, r5, r7
; CHECK-NEXT: lslseq.w r4, r4, #31
; CHECK-NEXT: andeq.w r5, r4, lr
; CHECK-NEXT: lslseq.w r5, r5, #31
; CHECK-NEXT: beq .LBB7_4
; CHECK-NEXT: @ %bb.2: @ %for.body.preheader
; CHECK-NEXT: sub.w r4, r12, #1
; CHECK-NEXT: sub.w r5, r12, #1
; CHECK-NEXT: and r9, r12, #3
; CHECK-NEXT: cmp r4, #3
; CHECK-NEXT: cmp r5, #3
; CHECK-NEXT: bhs .LBB7_6
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: mov.w r12, #0
@ -700,33 +702,35 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
; CHECK-NEXT: letp lr, .LBB7_5
; CHECK-NEXT: b .LBB7_11
; CHECK-NEXT: .LBB7_6: @ %for.body.preheader.new
; CHECK-NEXT: bic r7, r12, #3
; CHECK-NEXT: bic r5, r12, #3
; CHECK-NEXT: add.w r4, r3, #8
; CHECK-NEXT: subs r7, #4
; CHECK-NEXT: subs r5, #4
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: add.w lr, lr, r7, lsr #2
; CHECK-NEXT: add.w lr, r6, r5, lsr #2
; CHECK-NEXT: adds r5, r0, #3
; CHECK-NEXT: adds r6, r1, #1
; CHECK-NEXT: dls lr, lr
; CHECK-NEXT: .LBB7_7: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrb.w r5, [r0, r12]
; CHECK-NEXT: add.w r7, r1, r12
; CHECK-NEXT: ldrb.w r6, [r1, r12]
; CHECK-NEXT: smlabb r5, r6, r5, r2
; CHECK-NEXT: str r5, [r4, #-8]
; CHECK-NEXT: add.w r5, r0, r12
; CHECK-NEXT: ldrb r6, [r7, #1]
; CHECK-NEXT: ldrb r8, [r5, #-3]
; CHECK-NEXT: add.w r12, r12, #4
; CHECK-NEXT: ldrb.w r8, [r5, #1]
; CHECK-NEXT: smlabb r6, r6, r8, r2
; CHECK-NEXT: str r6, [r4, #-4]
; CHECK-NEXT: ldrb.w r8, [r5, #2]
; CHECK-NEXT: ldrb r6, [r7, #2]
; CHECK-NEXT: smlabb r6, r6, r8, r2
; CHECK-NEXT: str r6, [r4]
; CHECK-NEXT: ldrb r5, [r5, #3]
; CHECK-NEXT: ldrb r6, [r7, #3]
; CHECK-NEXT: smlabb r5, r6, r5, r2
; CHECK-NEXT: str r5, [r4, #4]
; CHECK-NEXT: ldrb r7, [r6, #-1]
; CHECK-NEXT: smlabb r7, r7, r8, r2
; CHECK-NEXT: str r7, [r4, #-8]
; CHECK-NEXT: ldrb r8, [r5, #-2]
; CHECK-NEXT: ldrb r7, [r6]
; CHECK-NEXT: smlabb r7, r7, r8, r2
; CHECK-NEXT: str r7, [r4, #-4]
; CHECK-NEXT: ldrb r8, [r5, #-1]
; CHECK-NEXT: ldrb r7, [r6, #1]
; CHECK-NEXT: smlabb r7, r7, r8, r2
; CHECK-NEXT: str r7, [r4]
; CHECK-NEXT: ldrb.w r8, [r5]
; CHECK-NEXT: adds r5, #4
; CHECK-NEXT: ldrb r7, [r6, #2]
; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: smlabb r7, r7, r8, r2
; CHECK-NEXT: str r7, [r4, #4]
; CHECK-NEXT: adds r4, #16
; CHECK-NEXT: le lr, .LBB7_7
; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup.loopexit.unr-lcssa
@ -738,10 +742,10 @@ define arm_aapcs_vfpcc void @test_vec_mul_scalar_add_uchar(i8* nocapture readonl
; CHECK-NEXT: mov lr, r9
; CHECK-NEXT: .LBB7_10: @ %for.body.epil
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldrb r7, [r0], #1
; CHECK-NEXT: ldrb r6, [r1], #1
; CHECK-NEXT: smlabb r7, r6, r7, r2
; CHECK-NEXT: str r7, [r3], #4
; CHECK-NEXT: ldrb r6, [r0], #1
; CHECK-NEXT: ldrb r5, [r1], #1
; CHECK-NEXT: smlabb r6, r5, r6, r2
; CHECK-NEXT: str r6, [r3], #4
; CHECK-NEXT: le lr, .LBB7_10
; CHECK-NEXT: .LBB7_11: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}