; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
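; The kernels below are DCT-style matrix-vector products vectorized with
; @llvm.get.active.lane.mask and masked loads. The checks verify that each
; inner loop is lowered to an MVE tail-predicated low-overhead loop
; (dlstp.32/letp) using post-incremented vector loads (vldrw.u32 ..., #16).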
%struct.DCT_InstanceTypeDef = type { float*, i32, i32 }
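; DCT_mve1: one coefficient row per outer iteration, so a single vector
; accumulator (q0) and one dlstp/letp inner loop.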
define void @DCT_mve1(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: ldr r3, [r0, #4]
; CHECK-NEXT: sub.w r12, r3, #1
; CHECK-NEXT: cmp.w r12, #2
; CHECK-NEXT: blo .LBB0_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r5, [r0, #8]
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: add.w r3, r3, r5, lsl #2
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsl.w r9, r5, #2
; CHECK-NEXT: .LBB0_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_3 Depth 2
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: mov r6, r1
; CHECK-NEXT: mov r7, r3
; CHECK-NEXT: dlstp.32 lr, r5
; CHECK-NEXT: .LBB0_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q1, [r6], #16
; CHECK-NEXT: vldrw.u32 q2, [r7], #16
; CHECK-NEXT: vfma.f32 q0, q2, q1
; CHECK-NEXT: letp lr, .LBB0_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: vadd.f32 s4, s2, s3
; CHECK-NEXT: add.w r7, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: adds r0, #1
; CHECK-NEXT: add r3, r9
; CHECK-NEXT: cmp r0, r12
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: vstr s0, [r7]
; CHECK-NEXT: bne .LBB0_2
; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -1
%cmp350 = icmp ugt i32 %sub, 1
br i1 %cmp350, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body
for.cond.cleanup: ; preds = %middle.block, %entry
ret void
for.body: ; preds = %for.body.preheader, %middle.block
%k2.051 = phi i32 [ %add16, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.051, %0
br label %vector.body
vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %10, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load53 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load53, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi
%10 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi
%index.next = add i32 %index, 4
%11 = icmp eq i32 %index.next, %n.vec
br i1 %11, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%12 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %10)
%arrayidx14 = getelementptr inbounds float, float* %pOut, i32 %k2.051
store float %12, float* %arrayidx14, align 4
%add16 = add nuw i32 %k2.051, 1
%exitcond52.not = icmp eq i32 %add16, %sub
br i1 %exitcond52.not, label %for.cond.cleanup, label %for.body
}
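
; DCT_mve2: two coefficient rows per outer iteration, accumulated into two
; vector registers within a single tail-predicated inner loop.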
define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #2
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo .LBB1_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr.w r12, [r0, #8]
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: add.w r11, r3, r12, lsl #2
; CHECK-NEXT: add.w r7, r3, r12, lsl #3
; CHECK-NEXT: lsl.w r9, r12, #3
; CHECK-NEXT: .LBB1_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB1_3 Depth 2
; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w r10, r4, #1
; CHECK-NEXT: mov r3, r11
; CHECK-NEXT: mov r0, r7
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB1_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q2, [r5], #16
; CHECK-NEXT: vldrw.u32 q3, [r3], #16
; CHECK-NEXT: vfma.f32 q1, q3, q2
; CHECK-NEXT: vldrw.u32 q3, [r0], #16
; CHECK-NEXT: vfma.f32 q0, q3, q2
; CHECK-NEXT: letp lr, .LBB1_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: vadd.f32 s8, s2, s3
; CHECK-NEXT: add.w r0, r2, r10, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: add r11, r9
; CHECK-NEXT: vadd.f32 s2, s6, s7
; CHECK-NEXT: add r7, r9
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s0, s0, s8
; CHECK-NEXT: vadd.f32 s2, s4, s2
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: add.w r0, r2, r4, lsl #2
; CHECK-NEXT: adds r4, #2
; CHECK-NEXT: cmp r4, r1
; CHECK-NEXT: vstr s2, [r0]
; CHECK-NEXT: blo .LBB1_2
; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -2
%cmp371 = icmp ugt i32 %sub, 1
br i1 %cmp371, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body
for.cond.cleanup: ; preds = %middle.block, %entry
ret void
for.body: ; preds = %for.body.preheader, %middle.block
%k2.072 = phi i32 [ %add25, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.072, %0
%add = add nuw i32 %k2.072, 1
%mul5 = mul i32 %add, %0
br label %vector.body
vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %15, %vector.body ]
%vec.phi73 = phi <4 x float> [ zeroinitializer, %for.body ], [ %16, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load74 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load74, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi73
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load75 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load75, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi
%15 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi
%16 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi73
%index.next = add i32 %index, 4
%17 = icmp eq i32 %index.next, %n.vec
br i1 %17, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %16)
%19 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %15)
%arrayidx21 = getelementptr inbounds float, float* %pOut, i32 %k2.072
store float %18, float* %arrayidx21, align 4
%arrayidx23 = getelementptr inbounds float, float* %pOut, i32 %add
store float %19, float* %arrayidx23, align 4
%add25 = add i32 %k2.072, 2
%cmp3 = icmp ult i32 %add25, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
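
; DCT_mve3: three coefficient rows per outer iteration, three accumulators.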
define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: subs r1, #3
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo .LBB2_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: add.w r0, r3, r3, lsl #1
; CHECK-NEXT: add.w r9, r1, r3, lsl #2
; CHECK-NEXT: add.w r12, r1, r3, lsl #3
; CHECK-NEXT: adds r3, #3
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add.w r10, r1, r0, lsl #2
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: lsl.w r11, r0, #2
; CHECK-NEXT: add.w r1, r5, r3, lsr #2
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB2_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB2_3 Depth 2
; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: adds r0, r5, #2
; CHECK-NEXT: adds r2, r5, #1
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: mov r4, r10
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB2_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q3, [r6], #16
; CHECK-NEXT: vldrw.u32 q4, [r3], #16
; CHECK-NEXT: vfma.f32 q1, q4, q3
; CHECK-NEXT: vldrw.u32 q4, [r0], #16
; CHECK-NEXT: vfma.f32 q2, q4, q3
; CHECK-NEXT: vldrw.u32 q4, [r4], #16
; CHECK-NEXT: vfma.f32 q0, q4, q3
; CHECK-NEXT: letp lr, .LBB2_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1
; CHECK-NEXT: vadd.f32 s12, s10, s11
; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: add r9, r11
; CHECK-NEXT: vadd.f32 s10, s6, s7
; CHECK-NEXT: add.w r0, r1, r2, lsl #2
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: add r12, r11
; CHECK-NEXT: vadd.f32 s6, s2, s3
; CHECK-NEXT: add r10, r11
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s2, s8, s12
; CHECK-NEXT: vadd.f32 s4, s4, s10
; CHECK-NEXT: vadd.f32 s0, s0, s6
; CHECK-NEXT: vstr s2, [r0]
; CHECK-NEXT: add.w r0, r1, r5, lsl #2
; CHECK-NEXT: adds r5, #3
; CHECK-NEXT: vstr s4, [r0]
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: blo .LBB2_2
; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -3
%cmp392 = icmp ugt i32 %sub, 1
br i1 %cmp392, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body
for.cond.cleanup: ; preds = %middle.block, %entry
ret void
for.body: ; preds = %for.body.preheader, %middle.block
%k2.093 = phi i32 [ %add34, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.093, %0
%add = add nuw i32 %k2.093, 1
%mul5 = mul i32 %add, %0
%add6 = add i32 %k2.093, 2
%mul7 = mul i32 %add6, %0
br label %vector.body
vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %20, %vector.body ]
%vec.phi94 = phi <4 x float> [ zeroinitializer, %for.body ], [ %21, %vector.body ]
%vec.phi95 = phi <4 x float> [ zeroinitializer, %for.body ], [ %22, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load96 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load96, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi95
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load97 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load97, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi94
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load98 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load98, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi
%20 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi
%21 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi94
%22 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi95
%index.next = add i32 %index, 4
%23 = icmp eq i32 %index.next, %n.vec
br i1 %23, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%24 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %22)
%25 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %21)
%26 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %20)
%arrayidx28 = getelementptr inbounds float, float* %pOut, i32 %k2.093
store float %24, float* %arrayidx28, align 4
%arrayidx30 = getelementptr inbounds float, float* %pOut, i32 %add
store float %25, float* %arrayidx30, align 4
%arrayidx32 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %26, float* %arrayidx32, align 4
%add34 = add i32 %k2.093, 3
%cmp3 = icmp ult i32 %add34, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
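
; DCT_mve4: four coefficient rows per outer iteration, four accumulators.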
define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: .pad #40
; CHECK-NEXT: sub sp, #40
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB3_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r2, [r0, #8]
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: add.w r0, r2, r2, lsl #1
; CHECK-NEXT: add.w r12, r1, r2, lsl #2
; CHECK-NEXT: add.w r8, r1, r2, lsl #3
; CHECK-NEXT: add.w r9, r1, r2, lsl #4
; CHECK-NEXT: add.w r11, r1, r0, lsl #2
; CHECK-NEXT: adds r0, r2, #3
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: add.w r0, r6, r0, lsr #2
; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill
; CHECK-NEXT: lsls r0, r2, #4
; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload
; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB3_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
; CHECK-NEXT: adds r0, r6, #3
; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: adds r0, r6, #2
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill
; CHECK-NEXT: adds r0, r6, #1
; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: mov r5, r11
; CHECK-NEXT: mov r4, r9
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB3_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q4, [r1], #16
; CHECK-NEXT: vldrw.u32 q5, [r0], #16
; CHECK-NEXT: vfma.f32 q3, q5, q4
; CHECK-NEXT: vldrw.u32 q5, [r3], #16
; CHECK-NEXT: vfma.f32 q2, q5, q4
; CHECK-NEXT: vldrw.u32 q5, [r5], #16
; CHECK-NEXT: vfma.f32 q1, q5, q4
; CHECK-NEXT: vldrw.u32 q5, [r4], #16
; CHECK-NEXT: vfma.f32 q0, q5, q4
; CHECK-NEXT: letp lr, .LBB3_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT: vadd.f32 s16, s14, s15
; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s14, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vadd.f32 s10, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s6, s2, s3
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s2, s12, s16
; CHECK-NEXT: vadd.f32 s8, s8, s14
; CHECK-NEXT: vadd.f32 s4, s4, s10
; CHECK-NEXT: vadd.f32 s0, s0, s6
; CHECK-NEXT: vstr s2, [r0]
; CHECK-NEXT: add.w r0, r1, r6, lsl #2
; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: vstr s8, [r0]
; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s4, [r0]
; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add r12, r0
; CHECK-NEXT: add r8, r0
; CHECK-NEXT: add r11, r0
; CHECK-NEXT: add r9, r0
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: blo .LBB3_2
; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #40
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -4
%cmp3113 = icmp ugt i32 %sub, 1
br i1 %cmp3113, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body
for.cond.cleanup: ; preds = %middle.block, %entry
ret void
for.body: ; preds = %for.body.preheader, %middle.block
%k2.0114 = phi i32 [ %add43, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0114, %0
%add = add nuw nsw i32 %k2.0114, 1
%mul5 = mul i32 %add, %0
%add6 = add nuw nsw i32 %k2.0114, 2
%mul7 = mul i32 %add6, %0
%add8 = add i32 %k2.0114, 3
%mul9 = mul i32 %add8, %0
br label %vector.body
vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %25, %vector.body ]
%vec.phi115 = phi <4 x float> [ zeroinitializer, %for.body ], [ %26, %vector.body ]
%vec.phi116 = phi <4 x float> [ zeroinitializer, %for.body ], [ %27, %vector.body ]
%vec.phi117 = phi <4 x float> [ zeroinitializer, %for.body ], [ %28, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load118 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load118, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi116
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load119 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load119, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi117
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load120 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load120, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi115
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load121 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load121, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi
%25 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi
%26 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi115
%27 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi116
%28 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi117
%index.next = add i32 %index, 4
%29 = icmp eq i32 %index.next, %n.vec
br i1 %29, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%30 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %28)
%31 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %27)
%32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %26)
%33 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %25)
%arrayidx35 = getelementptr inbounds float, float* %pOut, i32 %k2.0114
store float %31, float* %arrayidx35, align 4
%arrayidx37 = getelementptr inbounds float, float* %pOut, i32 %add
store float %30, float* %arrayidx37, align 4
%arrayidx39 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %32, float* %arrayidx39, align 4
%arrayidx41 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %33, float* %arrayidx41, align 4
%add43 = add i32 %k2.0114, 4
%cmp3 = icmp ult i32 %add43, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
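
; DCT_mve5: five coefficient rows per outer iteration; the extra coefficient
; pointers are formed by adding the row stride inside the inner loop rather
; than by post-incrementing each pointer.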
define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve5:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #5
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB4_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r8, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: add.w r1, r3, r3, lsl #2
; CHECK-NEXT: lsls r1, r1, #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB4_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB4_3 Depth 2
; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: add.w r10, r0, #2
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: add.w r11, r0, #1
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q2, q1
; CHECK-NEXT: vmov q4, q1
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB4_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r9, r3, r5
; CHECK-NEXT: vldrw.u32 q5, [r4], #16
; CHECK-NEXT: vldrw.u32 q6, [r3], #16
; CHECK-NEXT: vfma.f32 q3, q6, q5
; CHECK-NEXT: add.w r12, r9, r5
; CHECK-NEXT: vldrw.u32 q6, [r9]
; CHECK-NEXT: vfma.f32 q4, q6, q5
; CHECK-NEXT: add.w r6, r12, r5
; CHECK-NEXT: vldrw.u32 q6, [r12]
; CHECK-NEXT: vfma.f32 q2, q6, q5
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vldrw.u32 q6, [r6]
; CHECK-NEXT: vfma.f32 q0, q6, q5
; CHECK-NEXT: vldrw.u32 q6, [r7]
; CHECK-NEXT: vfma.f32 q1, q6, q5
; CHECK-NEXT: letp lr, .LBB4_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1
; CHECK-NEXT: vadd.f32 s20, s18, s19
; CHECK-NEXT: add.w r1, r2, r11, lsl #2
; CHECK-NEXT: vadd.f32 s16, s16, s17
; CHECK-NEXT: vadd.f32 s18, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s14, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s6, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s10, s2, s3
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s2, s16, s20
; CHECK-NEXT: vadd.f32 s12, s12, s18
; CHECK-NEXT: vadd.f32 s4, s4, s14
; CHECK-NEXT: vadd.f32 s6, s8, s6
; CHECK-NEXT: vadd.f32 s0, s0, s10
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: adds r0, #5
; CHECK-NEXT: vstr s12, [r1]
; CHECK-NEXT: add.w r1, r2, r10, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add r8, r1
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB4_2
; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -5
%cmp3134 = icmp ugt i32 %sub, 1
br i1 %cmp3134, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body
for.cond.cleanup: ; preds = %middle.block, %entry
ret void
for.body: ; preds = %for.body.preheader, %middle.block
%k2.0135 = phi i32 [ %add52, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0135, %0
%add = add nuw i32 %k2.0135, 1
%mul5 = mul i32 %add, %0
%add6 = add i32 %k2.0135, 2
%mul7 = mul i32 %add6, %0
%add8 = add i32 %k2.0135, 3
%mul9 = mul i32 %add8, %0
%add10 = add i32 %k2.0135, 4
%mul11 = mul i32 %add10, %0
br label %vector.body
vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %30, %vector.body ]
%vec.phi136 = phi <4 x float> [ zeroinitializer, %for.body ], [ %31, %vector.body ]
%vec.phi137 = phi <4 x float> [ zeroinitializer, %for.body ], [ %32, %vector.body ]
%vec.phi138 = phi <4 x float> [ zeroinitializer, %for.body ], [ %33, %vector.body ]
%vec.phi139 = phi <4 x float> [ zeroinitializer, %for.body ], [ %34, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load140 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load140, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi137
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load141 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load141, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi139
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load142 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load142, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi138
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load143 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load143, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi136
%25 = add i32 %index, %mul11
%26 = getelementptr inbounds float, float* %2, i32 %25
%27 = bitcast float* %26 to <4 x float>*
%wide.masked.load144 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%28 = fmul fast <4 x float> %wide.masked.load144, %wide.masked.load
%29 = fadd fast <4 x float> %28, %vec.phi
%30 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi
%31 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi136
%32 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi137
%33 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi138
%34 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi139
%index.next = add i32 %index, 4
%35 = icmp eq i32 %index.next, %n.vec
br i1 %35, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%36 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %34)
%37 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %33)
%38 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %32)
%39 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %31)
%40 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %30)
%arrayidx42 = getelementptr inbounds float, float* %pOut, i32 %k2.0135
store float %38, float* %arrayidx42, align 4
%arrayidx44 = getelementptr inbounds float, float* %pOut, i32 %add
store float %36, float* %arrayidx44, align 4
%arrayidx46 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %37, float* %arrayidx46, align 4
%arrayidx48 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %39, float* %arrayidx48, align 4
%arrayidx50 = getelementptr inbounds float, float* %pOut, i32 %add10
store float %40, float* %arrayidx50, align 4
%add52 = add i32 %k2.0135, 5
%cmp3 = icmp ult i32 %add52, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
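
; DCT_mve6: six coefficient rows per outer iteration, six accumulators; as in
; DCT_mve5, only the first coefficient pointer is post-incremented.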
define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve6:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #6
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB5_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r8, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: add.w r1, r3, r3, lsl #1
; CHECK-NEXT: lsls r1, r1, #3
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB5_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB5_3 Depth 2
; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: add.w r11, r0, #2
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #1
; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q4, q1
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vmov q5, q1
; CHECK-NEXT: vmov q2, q1
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB5_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r12, r3, r5
; CHECK-NEXT: vldrw.u32 q6, [r1], #16
; CHECK-NEXT: vldrw.u32 q7, [r3], #16
; CHECK-NEXT: vfma.f32 q4, q7, q6
; CHECK-NEXT: add.w r10, r12, r5
; CHECK-NEXT: vldrw.u32 q7, [r12]
; CHECK-NEXT: vfma.f32 q5, q7, q6
; CHECK-NEXT: add.w r6, r10, r5
; CHECK-NEXT: vldrw.u32 q7, [r10]
; CHECK-NEXT: vfma.f32 q2, q7, q6
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vldrw.u32 q7, [r6]
; CHECK-NEXT: vfma.f32 q0, q7, q6
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vldrw.u32 q7, [r7]
; CHECK-NEXT: vfma.f32 q3, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [r6]
; CHECK-NEXT: vfma.f32 q1, q7, q6
; CHECK-NEXT: letp lr, .LBB5_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1
; CHECK-NEXT: vadd.f32 s24, s22, s23
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vadd.f32 s20, s20, s21
; CHECK-NEXT: vadd.f32 s22, s18, s19
; CHECK-NEXT: vadd.f32 s16, s16, s17
; CHECK-NEXT: vadd.f32 s18, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s6, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s14, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s10, s2, s3
; CHECK-NEXT: vadd.f32 s2, s20, s24
; CHECK-NEXT: vadd.f32 s1, s16, s22
; CHECK-NEXT: vadd.f32 s6, s12, s6
; CHECK-NEXT: vadd.f32 s4, s4, s18
; CHECK-NEXT: vadd.f32 s8, s8, s14
; CHECK-NEXT: vadd.f32 s0, s0, s10
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: vstr s1, [r1]
; CHECK-NEXT: add.w r1, r2, r11, lsl #2
; CHECK-NEXT: vstr s8, [r1]
; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add r8, r1
; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB5_2
; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -6
%cmp3155 = icmp ugt i32 %sub, 1
br i1 %cmp3155, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body
for.cond.cleanup: ; preds = %middle.block, %entry
ret void
for.body: ; preds = %for.body.preheader, %middle.block
%k2.0156 = phi i32 [ %add61, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0156, %0
%add = add nuw i32 %k2.0156, 1
%mul5 = mul i32 %add, %0
%add6 = add i32 %k2.0156, 2
%mul7 = mul i32 %add6, %0
%add8 = add i32 %k2.0156, 3
%mul9 = mul i32 %add8, %0
%add10 = add i32 %k2.0156, 4
%mul11 = mul i32 %add10, %0
%add12 = add i32 %k2.0156, 5
%mul13 = mul i32 %add12, %0
br label %vector.body
vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %35, %vector.body ]
%vec.phi157 = phi <4 x float> [ zeroinitializer, %for.body ], [ %36, %vector.body ]
%vec.phi158 = phi <4 x float> [ zeroinitializer, %for.body ], [ %37, %vector.body ]
%vec.phi159 = phi <4 x float> [ zeroinitializer, %for.body ], [ %38, %vector.body ]
%vec.phi160 = phi <4 x float> [ zeroinitializer, %for.body ], [ %39, %vector.body ]
%vec.phi161 = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load162 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load162, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi158
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load163 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load163, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi160
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load164 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load164, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi161
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load165 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load165, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi159
%25 = add i32 %index, %mul11
%26 = getelementptr inbounds float, float* %2, i32 %25
%27 = bitcast float* %26 to <4 x float>*
%wide.masked.load166 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%28 = fmul fast <4 x float> %wide.masked.load166, %wide.masked.load
%29 = fadd fast <4 x float> %28, %vec.phi157
%30 = add i32 %index, %mul13
%31 = getelementptr inbounds float, float* %2, i32 %30
%32 = bitcast float* %31 to <4 x float>*
%wide.masked.load167 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%33 = fmul fast <4 x float> %wide.masked.load167, %wide.masked.load
%34 = fadd fast <4 x float> %33, %vec.phi
%35 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi
%36 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi157
%37 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi158
%38 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi159
%39 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi160
%40 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi161
%index.next = add i32 %index, 4
%41 = icmp eq i32 %index.next, %n.vec
br i1 %41, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%42 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40)
%43 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %39)
%44 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %38)
%45 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %37)
%46 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %36)
%47 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %35)
%arrayidx49 = getelementptr inbounds float, float* %pOut, i32 %k2.0156
store float %45, float* %arrayidx49, align 4
%arrayidx51 = getelementptr inbounds float, float* %pOut, i32 %add
store float %43, float* %arrayidx51, align 4
%arrayidx53 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %42, float* %arrayidx53, align 4
%arrayidx55 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %44, float* %arrayidx55, align 4
%arrayidx57 = getelementptr inbounds float, float* %pOut, i32 %add10
store float %46, float* %arrayidx57, align 4
%arrayidx59 = getelementptr inbounds float, float* %pOut, i32 %add12
store float %47, float* %arrayidx59, align 4
%add61 = add i32 %k2.0156, 6
%cmp3 = icmp ult i32 %add61, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
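; DCT_mve7: the same dot-product kernel as the variants above, unrolled by 7.
; Each outer iteration runs a tail-predicated inner vector loop that
; accumulates seven dot products of %pIn against rows k2..k2+6 of the
; coefficient matrix, then reduces each accumulator and stores the scalars to
; %pOut[k2..k2+6].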
define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve7:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #88
; CHECK-NEXT: sub sp, #88
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #7
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB6_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r9, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: rsb r1, r3, r3, lsl #3
; CHECK-NEXT: lsls r1, r1, #2
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: .LBB6_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB6_3 Depth 2
; CHECK-NEXT: adds r1, r0, #6
; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
; CHECK-NEXT: vmov.i32 q2, #0x0
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #2
; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: add.w r8, r0, #1
; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: vmov q5, q2
; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vmov q6, q2
; CHECK-NEXT: vmov q1, q2
; CHECK-NEXT: mov r12, r7
; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: dls lr, r6
; CHECK-NEXT: .LBB6_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r10, r3, r5
; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q7, [r1], #16
; CHECK-NEXT: vldrwt.u32 q0, [r3], #16
; CHECK-NEXT: vfmat.f32 q5, q0, q7
; CHECK-NEXT: add.w r11, r10, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r10]
; CHECK-NEXT: vfmat.f32 q6, q0, q7
; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r11]
; CHECK-NEXT: vfmat.f32 q1, q0, q7
; CHECK-NEXT: add.w r6, r11, r5
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vmov q5, q4
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r6]
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q1, q0, q7
; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vmov q2, q4
; CHECK-NEXT: vmov q4, q5
; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r7]
; CHECK-NEXT: vfmat.f32 q3, q0, q7
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrwt.u32 q0, [r6]
; CHECK-NEXT: vfmat.f32 q4, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r7]
; CHECK-NEXT: vfmat.f32 q2, q0, q7
; CHECK-NEXT: le lr, .LBB6_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB6_2 Depth=1
; CHECK-NEXT: vadd.f32 s0, s26, s27
; CHECK-NEXT: add.w r1, r2, r8, lsl #2
; CHECK-NEXT: vadd.f32 s2, s24, s25
; CHECK-NEXT: vadd.f32 s3, s20, s21
; CHECK-NEXT: vadd.f32 s1, s22, s23
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s20, s10, s11
; CHECK-NEXT: vadd.f32 s11, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s14, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vadd.f32 s10, s18, s19
; CHECK-NEXT: vadd.f32 s9, s16, s17
; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s2, s3, s1
; CHECK-NEXT: vadd.f32 s6, s18, s19
; CHECK-NEXT: vadd.f32 s5, s16, s17
; CHECK-NEXT: vadd.f32 s4, s4, s14
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s12, s12, s11
; CHECK-NEXT: adds r0, #7
; CHECK-NEXT: vadd.f32 s10, s9, s10
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vadd.f32 s8, s8, s20
; CHECK-NEXT: vadd.f32 s6, s5, s6
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s10, [r1]
; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: add r9, r1
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB6_2
; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #88
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -7
%cmp3176 = icmp ugt i32 %sub, 1
br i1 %cmp3176, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body
for.cond.cleanup: ; preds = %middle.block, %entry
ret void
for.body: ; preds = %for.body.preheader, %middle.block
%k2.0177 = phi i32 [ %add70, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0177, %0
%add = add nuw i32 %k2.0177, 1
%mul5 = mul i32 %add, %0
%add6 = add i32 %k2.0177, 2
%mul7 = mul i32 %add6, %0
%add8 = add i32 %k2.0177, 3
%mul9 = mul i32 %add8, %0
%add10 = add i32 %k2.0177, 4
%mul11 = mul i32 %add10, %0
%add12 = add i32 %k2.0177, 5
%mul13 = mul i32 %add12, %0
%add14 = add i32 %k2.0177, 6
%mul15 = mul i32 %add14, %0
br label %vector.body
vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ]
%vec.phi178 = phi <4 x float> [ zeroinitializer, %for.body ], [ %41, %vector.body ]
%vec.phi179 = phi <4 x float> [ zeroinitializer, %for.body ], [ %42, %vector.body ]
%vec.phi180 = phi <4 x float> [ zeroinitializer, %for.body ], [ %43, %vector.body ]
%vec.phi181 = phi <4 x float> [ zeroinitializer, %for.body ], [ %44, %vector.body ]
%vec.phi182 = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ]
%vec.phi183 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load184 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load184, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi179
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load185 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load185, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi181
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load186 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load186, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi183
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load187 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load187, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi182
%25 = add i32 %index, %mul11
%26 = getelementptr inbounds float, float* %2, i32 %25
%27 = bitcast float* %26 to <4 x float>*
%wide.masked.load188 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%28 = fmul fast <4 x float> %wide.masked.load188, %wide.masked.load
%29 = fadd fast <4 x float> %28, %vec.phi180
%30 = add i32 %index, %mul13
%31 = getelementptr inbounds float, float* %2, i32 %30
%32 = bitcast float* %31 to <4 x float>*
%wide.masked.load189 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%33 = fmul fast <4 x float> %wide.masked.load189, %wide.masked.load
%34 = fadd fast <4 x float> %33, %vec.phi178
%35 = add i32 %index, %mul15
%36 = getelementptr inbounds float, float* %2, i32 %35
%37 = bitcast float* %36 to <4 x float>*
%wide.masked.load190 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%38 = fmul fast <4 x float> %wide.masked.load190, %wide.masked.load
%39 = fadd fast <4 x float> %38, %vec.phi
%40 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi
%41 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi178
%42 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi179
%43 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi180
%44 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi181
%45 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi182
%46 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi183
%index.next = add i32 %index, 4
%47 = icmp eq i32 %index.next, %n.vec
br i1 %47, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%48 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46)
%49 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45)
%50 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %44)
%51 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %43)
%52 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %42)
%53 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %41)
%54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40)
%arrayidx56 = getelementptr inbounds float, float* %pOut, i32 %k2.0177
store float %52, float* %arrayidx56, align 4
%arrayidx58 = getelementptr inbounds float, float* %pOut, i32 %add
store float %50, float* %arrayidx58, align 4
%arrayidx60 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %48, float* %arrayidx60, align 4
%arrayidx62 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %49, float* %arrayidx62, align 4
%arrayidx64 = getelementptr inbounds float, float* %pOut, i32 %add10
store float %51, float* %arrayidx64, align 4
%arrayidx66 = getelementptr inbounds float, float* %pOut, i32 %add12
store float %53, float* %arrayidx66, align 4
%arrayidx68 = getelementptr inbounds float, float* %pOut, i32 %add14
store float %54, float* %arrayidx68, align 4
%add70 = add i32 %k2.0177, 7
%cmp3 = icmp ult i32 %add70, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
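; DCT_mve8: the 8-way variant of the same reduction. With eight accumulators
; live across the tail-predicated inner loop, the expected code below spills
; some of the q-register accumulators to the stack between iterations (see the
; 16-byte vstrw spill / vldrw reload checks inside .LBB7_3).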
define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #104
; CHECK-NEXT: sub sp, #104
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #8
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB7_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r12, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: lsls r1, r3, #5
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: .LBB7_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB7_3 Depth 2
; CHECK-NEXT: adds r1, r0, #7
; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #6
; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #3
; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
; CHECK-NEXT: add.w r8, r0, #2
; CHECK-NEXT: adds r1, r0, #1
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vmov q6, q3
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q7, q3
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: mov r10, r7
; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: dls lr, r6
; CHECK-NEXT: .LBB7_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r11, r3, r5
; CHECK-NEXT: vctp.32 r10
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q0, [r9], #16
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
; CHECK-NEXT: vfmat.f32 q6, q1, q0
; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r11]
; CHECK-NEXT: vfmat.f32 q7, q1, q0
; CHECK-NEXT: add.w r6, r11, r5
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q1, [r7]
; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: vmov q2, q4
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q3, q5
; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: sub.w r10, r10, #4
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrwt.u32 q1, [r7]
; CHECK-NEXT: vfmat.f32 q4, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q5, q1, q0
; CHECK-NEXT: add r6, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q3, q1, q0
; CHECK-NEXT: le lr, .LBB7_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1
; CHECK-NEXT: vadd.f32 s0, s30, s31
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vadd.f32 s2, s28, s29
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s5, s14, s15
; CHECK-NEXT: vadd.f32 s4, s26, s27
; CHECK-NEXT: vadd.f32 s6, s24, s25
; CHECK-NEXT: vadd.f32 s14, s18, s19
; CHECK-NEXT: vadd.f32 s7, s16, s17
; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s13, s10, s11
; CHECK-NEXT: vadd.f32 s10, s18, s19
; CHECK-NEXT: vadd.f32 s9, s16, s17
; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vadd.f32 s11, s18, s19
; CHECK-NEXT: vadd.f32 s15, s16, s17
; CHECK-NEXT: vadd.f32 s2, s6, s4
; CHECK-NEXT: vadd.f32 s6, s12, s5
; CHECK-NEXT: vadd.f32 s12, s7, s14
; CHECK-NEXT: vadd.f32 s10, s9, s10
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s8, s8, s13
; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: vadd.f32 s14, s15, s11
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r8, lsl #2
; CHECK-NEXT: vadd.f32 s1, s22, s23
; CHECK-NEXT: vadd.f32 s3, s20, s21
; CHECK-NEXT: vstr s10, [r1]
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vstr s14, [r1]
; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s4, s3, s1
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: add r12, r1
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB7_2
; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #104
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
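; The IR below processes eight filter rows per outer-loop iteration: the
; tail-predicated vector.body keeps eight independent <4 x float> accumulators
; under llvm.get.active.lane.mask, and middle.block reduces each one with
; llvm.vector.reduce.fadd before storing the eight scalar results to pOut.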
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -8
%cmp3197 = icmp ugt i32 %sub, 1
br i1 %cmp3197, label %for.body.preheader, label %for.cond.cleanup
for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body
for.cond.cleanup: ; preds = %middle.block, %entry
ret void
for.body: ; preds = %for.body.preheader, %middle.block
%k2.0198 = phi i32 [ %add79, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0198, %0
%add = add nuw nsw i32 %k2.0198, 1
%mul5 = mul i32 %add, %0
%add6 = add nuw nsw i32 %k2.0198, 2
%mul7 = mul i32 %add6, %0
%add8 = add nuw nsw i32 %k2.0198, 3
%mul9 = mul i32 %add8, %0
%add10 = add nuw nsw i32 %k2.0198, 4
%mul11 = mul i32 %add10, %0
%add12 = add nuw nsw i32 %k2.0198, 5
%mul13 = mul i32 %add12, %0
%add14 = add nuw nsw i32 %k2.0198, 6
%mul15 = mul i32 %add14, %0
%add16 = add i32 %k2.0198, 7
%mul17 = mul i32 %add16, %0
br label %vector.body
vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ]
%vec.phi199 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ]
%vec.phi200 = phi <4 x float> [ zeroinitializer, %for.body ], [ %47, %vector.body ]
%vec.phi201 = phi <4 x float> [ zeroinitializer, %for.body ], [ %48, %vector.body ]
%vec.phi202 = phi <4 x float> [ zeroinitializer, %for.body ], [ %49, %vector.body ]
%vec.phi203 = phi <4 x float> [ zeroinitializer, %for.body ], [ %50, %vector.body ]
%vec.phi204 = phi <4 x float> [ zeroinitializer, %for.body ], [ %51, %vector.body ]
%vec.phi205 = phi <4 x float> [ zeroinitializer, %for.body ], [ %52, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load206 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load206, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi200
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load207 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load207, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi202
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load208 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load208, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi204
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load209 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load209, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi205
%25 = add i32 %index, %mul11
%26 = getelementptr inbounds float, float* %2, i32 %25
%27 = bitcast float* %26 to <4 x float>*
%wide.masked.load210 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%28 = fmul fast <4 x float> %wide.masked.load210, %wide.masked.load
%29 = fadd fast <4 x float> %28, %vec.phi203
%30 = add i32 %index, %mul13
%31 = getelementptr inbounds float, float* %2, i32 %30
%32 = bitcast float* %31 to <4 x float>*
%wide.masked.load211 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%33 = fmul fast <4 x float> %wide.masked.load211, %wide.masked.load
%34 = fadd fast <4 x float> %33, %vec.phi201
%35 = add i32 %index, %mul15
%36 = getelementptr inbounds float, float* %2, i32 %35
%37 = bitcast float* %36 to <4 x float>*
%wide.masked.load212 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%38 = fmul fast <4 x float> %wide.masked.load212, %wide.masked.load
%39 = fadd fast <4 x float> %38, %vec.phi199
%40 = add i32 %index, %mul17
%41 = getelementptr inbounds float, float* %2, i32 %40
%42 = bitcast float* %41 to <4 x float>*
%wide.masked.load213 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %42, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%43 = fmul fast <4 x float> %wide.masked.load213, %wide.masked.load
%44 = fadd fast <4 x float> %43, %vec.phi
%45 = select <4 x i1> %active.lane.mask, <4 x float> %44, <4 x float> %vec.phi
%46 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi199
%47 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi200
%48 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi201
%49 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi202
%50 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi203
%51 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi204
%52 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi205
%index.next = add i32 %index, 4
%53 = icmp eq i32 %index.next, %n.vec
br i1 %53, label %middle.block, label %vector.body
middle.block: ; preds = %vector.body
%54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %52)
%55 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %51)
%56 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %50)
%57 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %49)
%58 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %48)
%59 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %47)
%60 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46)
%61 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45)
%arrayidx63 = getelementptr inbounds float, float* %pOut, i32 %k2.0198
store float %59, float* %arrayidx63, align 4
%arrayidx65 = getelementptr inbounds float, float* %pOut, i32 %add
store float %57, float* %arrayidx65, align 4
%arrayidx67 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %55, float* %arrayidx67, align 4
%arrayidx69 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %54, float* %arrayidx69, align 4
%arrayidx71 = getelementptr inbounds float, float* %pOut, i32 %add10
store float %56, float* %arrayidx71, align 4
%arrayidx73 = getelementptr inbounds float, float* %pOut, i32 %add12
store float %58, float* %arrayidx73, align 4
%arrayidx75 = getelementptr inbounds float, float* %pOut, i32 %add14
store float %60, float* %arrayidx75, align 4
%arrayidx77 = getelementptr inbounds float, float* %pOut, i32 %add16
store float %61, float* %arrayidx77, align 4
%add79 = add i32 %k2.0198, 8
%cmp3 = icmp ult i32 %add79, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}
declare void @llvm.assume(i1 noundef)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)