; From the commit that last regenerated these checks: with t2DoLoopDec we can
; be left with some extra MOVs in the preheaders of tail-predicated loops.
; That change removes them, in the same way other dead variables are removed.
; Differential Revision: https://reviews.llvm.org/D91857
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s

%struct.DCT_InstanceTypeDef = type { float*, i32, i32 }

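; Each DCT_mveN function below is the same filter-bank kernel with the outer
; loop manually unrolled to produce N outputs per iteration. As a rough,
; illustrative C-level sketch (an assumption for readability, not the exact
; source these tests were reduced from; loop bounds are approximate):
;
;   void DCT_mveN(DCT_InstanceTypeDef *S, float *pIn, float *pOut) {
;     // N = 1..7, matching DCT_mve1..DCT_mve7
;     for (uint32_t k2 = 1; k2 + N < S->NumFilters - N; k2 += N)
;       for (uint32_t j = 0; j < N; j++) {
;         float sum = 0.0f;
;         for (uint32_t i = 0; i < S->NumInputs; i++)
;           sum += S->pDCTCoefs[(k2 + j) * S->NumInputs + i] * pIn[i];
;         pOut[k2 + j] = sum;
;       }
;   }
;
; The inner reduction is tail-predicated by the vectorizer
; (llvm.get.active.lane.mask plus masked loads), which the backend should
; lower to a DLSTP/LETP hardware loop without leaving dead MOVs behind in
; the loop preheader.
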
define void @DCT_mve1(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: ldr r3, [r0, #4]
; CHECK-NEXT: sub.w r12, r3, #1
; CHECK-NEXT: cmp.w r12, #2
; CHECK-NEXT: blo .LBB0_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r5, [r0, #8]
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: add.w r3, r3, r5, lsl #2
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsl.w r9, r5, #2
; CHECK-NEXT: .LBB0_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_3 Depth 2
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: mov r6, r1
; CHECK-NEXT: mov r7, r3
; CHECK-NEXT: dlstp.32 lr, r5
; CHECK-NEXT: .LBB0_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB0_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q1, [r6], #16
; CHECK-NEXT: vldrw.u32 q2, [r7], #16
; CHECK-NEXT: vfma.f32 q0, q2, q1
; CHECK-NEXT: letp lr, .LBB0_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: vadd.f32 s4, s2, s3
; CHECK-NEXT: add.w r7, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: adds r0, #1
; CHECK-NEXT: add r3, r9
; CHECK-NEXT: cmp r0, r12
; CHECK-NEXT: vadd.f32 s0, s0, s4
; CHECK-NEXT: vstr s0, [r7]
; CHECK-NEXT: bne .LBB0_2
; CHECK-NEXT: .LBB0_5: @ %for.cond.cleanup
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -1
%cmp350 = icmp ugt i32 %sub, 1
br i1 %cmp350, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
%k2.051 = phi i32 [ %add16, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.051, %0
br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %10, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load53 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load53, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi
%10 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi
%index.next = add i32 %index, 4
%11 = icmp eq i32 %index.next, %n.vec
br i1 %11, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
%12 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %10)
%arrayidx14 = getelementptr inbounds float, float* %pOut, i32 %k2.051
store float %12, float* %arrayidx14, align 4
%add16 = add nuw i32 %k2.051, 1
%exitcond52.not = icmp eq i32 %add16, %sub
br i1 %exitcond52.not, label %for.cond.cleanup, label %for.body
}

define void @DCT_mve2(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve2:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #2
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo .LBB1_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr.w r12, [r0, #8]
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: add.w r11, r3, r12, lsl #2
; CHECK-NEXT: add.w r7, r3, r12, lsl #3
; CHECK-NEXT: lsl.w r9, r12, #3
; CHECK-NEXT: .LBB1_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB1_3 Depth 2
; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: add.w r10, r4, #1
; CHECK-NEXT: mov r3, r11
; CHECK-NEXT: mov r0, r7
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB1_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q2, [r5], #16
; CHECK-NEXT: vldrw.u32 q3, [r3], #16
; CHECK-NEXT: vfma.f32 q1, q3, q2
; CHECK-NEXT: vldrw.u32 q3, [r0], #16
; CHECK-NEXT: vfma.f32 q0, q3, q2
; CHECK-NEXT: letp lr, .LBB1_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: vadd.f32 s8, s2, s3
; CHECK-NEXT: add.w r0, r2, r10, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: add r11, r9
; CHECK-NEXT: vadd.f32 s2, s6, s7
; CHECK-NEXT: add r7, r9
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s0, s0, s8
; CHECK-NEXT: vadd.f32 s2, s4, s2
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: add.w r0, r2, r4, lsl #2
; CHECK-NEXT: adds r4, #2
; CHECK-NEXT: cmp r4, r1
; CHECK-NEXT: vstr s2, [r0]
; CHECK-NEXT: blo .LBB1_2
; CHECK-NEXT: .LBB1_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -2
%cmp371 = icmp ugt i32 %sub, 1
br i1 %cmp371, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
%k2.072 = phi i32 [ %add25, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.072, %0
%add = add nuw i32 %k2.072, 1
%mul5 = mul i32 %add, %0
br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %15, %vector.body ]
%vec.phi73 = phi <4 x float> [ zeroinitializer, %for.body ], [ %16, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load74 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load74, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi73
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load75 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load75, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi
%15 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi
%16 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi73
%index.next = add i32 %index, 4
%17 = icmp eq i32 %index.next, %n.vec
br i1 %17, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
%18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %16)
%19 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %15)
%arrayidx21 = getelementptr inbounds float, float* %pOut, i32 %k2.072
store float %18, float* %arrayidx21, align 4
%arrayidx23 = getelementptr inbounds float, float* %pOut, i32 %add
store float %19, float* %arrayidx23, align 4
%add25 = add i32 %k2.072, 2
%cmp3 = icmp ult i32 %add25, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

define void @DCT_mve3(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: subs r1, #3
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo .LBB2_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: add.w r0, r3, r3, lsl #1
; CHECK-NEXT: add.w r9, r1, r3, lsl #2
; CHECK-NEXT: add.w r12, r1, r3, lsl #3
; CHECK-NEXT: adds r3, #3
; CHECK-NEXT: bic r3, r3, #3
; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add.w r10, r1, r0, lsl #2
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: lsl.w r11, r0, #2
; CHECK-NEXT: add.w r1, r5, r3, lsr #2
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB2_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB2_3 Depth 2
; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: adds r0, r5, #2
; CHECK-NEXT: adds r2, r5, #1
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: mov r3, r9
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: mov r4, r10
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB2_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q3, [r6], #16
; CHECK-NEXT: vldrw.u32 q4, [r3], #16
; CHECK-NEXT: vfma.f32 q1, q4, q3
; CHECK-NEXT: vldrw.u32 q4, [r0], #16
; CHECK-NEXT: vfma.f32 q2, q4, q3
; CHECK-NEXT: vldrw.u32 q4, [r4], #16
; CHECK-NEXT: vfma.f32 q0, q4, q3
; CHECK-NEXT: letp lr, .LBB2_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1
; CHECK-NEXT: vadd.f32 s12, s10, s11
; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: add r9, r11
; CHECK-NEXT: vadd.f32 s10, s6, s7
; CHECK-NEXT: add.w r0, r1, r2, lsl #2
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: add r12, r11
; CHECK-NEXT: vadd.f32 s6, s2, s3
; CHECK-NEXT: add r10, r11
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s2, s8, s12
; CHECK-NEXT: vadd.f32 s4, s4, s10
; CHECK-NEXT: vadd.f32 s0, s0, s6
; CHECK-NEXT: vstr s2, [r0]
; CHECK-NEXT: add.w r0, r1, r5, lsl #2
; CHECK-NEXT: adds r5, #3
; CHECK-NEXT: vstr s4, [r0]
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: blo .LBB2_2
; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -3
%cmp392 = icmp ugt i32 %sub, 1
br i1 %cmp392, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
%k2.093 = phi i32 [ %add34, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.093, %0
%add = add nuw i32 %k2.093, 1
%mul5 = mul i32 %add, %0
%add6 = add i32 %k2.093, 2
%mul7 = mul i32 %add6, %0
br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %20, %vector.body ]
%vec.phi94 = phi <4 x float> [ zeroinitializer, %for.body ], [ %21, %vector.body ]
%vec.phi95 = phi <4 x float> [ zeroinitializer, %for.body ], [ %22, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load96 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load96, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi95
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load97 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load97, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi94
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load98 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load98, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi
%20 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi
%21 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi94
%22 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi95
%index.next = add i32 %index, 4
%23 = icmp eq i32 %index.next, %n.vec
br i1 %23, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
%24 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %22)
%25 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %21)
%26 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %20)
%arrayidx28 = getelementptr inbounds float, float* %pOut, i32 %k2.093
store float %24, float* %arrayidx28, align 4
%arrayidx30 = getelementptr inbounds float, float* %pOut, i32 %add
store float %25, float* %arrayidx30, align 4
%arrayidx32 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %26, float* %arrayidx32, align 4
%add34 = add i32 %k2.093, 3
%cmp3 = icmp ult i32 %add34, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

define void @DCT_mve4(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve4:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: .pad #40
; CHECK-NEXT: sub sp, #40
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: subs r1, #4
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB3_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r2, [r0, #8]
; CHECK-NEXT: movs r6, #1
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: add.w r0, r2, r2, lsl #1
; CHECK-NEXT: add.w r12, r1, r2, lsl #2
; CHECK-NEXT: add.w r8, r1, r2, lsl #3
; CHECK-NEXT: add.w r9, r1, r2, lsl #4
; CHECK-NEXT: add.w r11, r1, r0, lsl #2
; CHECK-NEXT: adds r0, r2, #3
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: add.w r0, r6, r0, lsr #2
; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill
; CHECK-NEXT: lsls r0, r2, #4
; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload
; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB3_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
; CHECK-NEXT: adds r0, r6, #3
; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: adds r0, r6, #2
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill
; CHECK-NEXT: adds r0, r6, #1
; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: mov r5, r11
; CHECK-NEXT: mov r4, r9
; CHECK-NEXT: vmov q1, q0
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB3_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q4, [r1], #16
; CHECK-NEXT: vldrw.u32 q5, [r0], #16
; CHECK-NEXT: vfma.f32 q3, q5, q4
; CHECK-NEXT: vldrw.u32 q5, [r3], #16
; CHECK-NEXT: vfma.f32 q2, q5, q4
; CHECK-NEXT: vldrw.u32 q5, [r5], #16
; CHECK-NEXT: vfma.f32 q1, q5, q4
; CHECK-NEXT: vldrw.u32 q5, [r4], #16
; CHECK-NEXT: vfma.f32 q0, q5, q4
; CHECK-NEXT: letp lr, .LBB3_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT: vadd.f32 s16, s14, s15
; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s14, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vadd.f32 s10, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s6, s2, s3
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s2, s12, s16
; CHECK-NEXT: vadd.f32 s8, s8, s14
; CHECK-NEXT: vadd.f32 s4, s4, s10
; CHECK-NEXT: vadd.f32 s0, s0, s6
; CHECK-NEXT: vstr s2, [r0]
; CHECK-NEXT: add.w r0, r1, r6, lsl #2
; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: vstr s8, [r0]
; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s4, [r0]
; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s0, [r0]
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add r12, r0
; CHECK-NEXT: add r8, r0
; CHECK-NEXT: add r11, r0
; CHECK-NEXT: add r9, r0
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: blo .LBB3_2
; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #40
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -4
%cmp3113 = icmp ugt i32 %sub, 1
br i1 %cmp3113, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
%k2.0114 = phi i32 [ %add43, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0114, %0
%add = add nuw nsw i32 %k2.0114, 1
%mul5 = mul i32 %add, %0
%add6 = add nuw nsw i32 %k2.0114, 2
%mul7 = mul i32 %add6, %0
%add8 = add i32 %k2.0114, 3
%mul9 = mul i32 %add8, %0
br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %25, %vector.body ]
%vec.phi115 = phi <4 x float> [ zeroinitializer, %for.body ], [ %26, %vector.body ]
%vec.phi116 = phi <4 x float> [ zeroinitializer, %for.body ], [ %27, %vector.body ]
%vec.phi117 = phi <4 x float> [ zeroinitializer, %for.body ], [ %28, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load118 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load118, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi116
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load119 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load119, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi117
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load120 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load120, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi115
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load121 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load121, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi
%25 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi
%26 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi115
%27 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi116
%28 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi117
%index.next = add i32 %index, 4
%29 = icmp eq i32 %index.next, %n.vec
br i1 %29, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
%30 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %28)
%31 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %27)
%32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %26)
%33 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %25)
%arrayidx35 = getelementptr inbounds float, float* %pOut, i32 %k2.0114
store float %31, float* %arrayidx35, align 4
%arrayidx37 = getelementptr inbounds float, float* %pOut, i32 %add
store float %30, float* %arrayidx37, align 4
%arrayidx39 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %32, float* %arrayidx39, align 4
%arrayidx41 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %33, float* %arrayidx41, align 4
%add43 = add i32 %k2.0114, 4
%cmp3 = icmp ult i32 %add43, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

define void @DCT_mve5(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve5:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #5
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB4_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r8, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: add.w r1, r3, r3, lsl #2
; CHECK-NEXT: lsls r1, r1, #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: .LBB4_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB4_3 Depth 2
; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: add.w r10, r0, #2
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: add.w r11, r0, #1
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q2, q1
; CHECK-NEXT: vmov q4, q1
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB4_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r9, r3, r5
; CHECK-NEXT: vldrw.u32 q5, [r4], #16
; CHECK-NEXT: vldrw.u32 q6, [r3], #16
; CHECK-NEXT: vfma.f32 q3, q6, q5
; CHECK-NEXT: add.w r12, r9, r5
; CHECK-NEXT: vldrw.u32 q6, [r9]
; CHECK-NEXT: vfma.f32 q4, q6, q5
; CHECK-NEXT: add.w r6, r12, r5
; CHECK-NEXT: vldrw.u32 q6, [r12]
; CHECK-NEXT: vfma.f32 q2, q6, q5
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vldrw.u32 q6, [r6]
; CHECK-NEXT: vfma.f32 q0, q6, q5
; CHECK-NEXT: vldrw.u32 q6, [r7]
; CHECK-NEXT: vfma.f32 q1, q6, q5
; CHECK-NEXT: letp lr, .LBB4_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB4_2 Depth=1
; CHECK-NEXT: vadd.f32 s20, s18, s19
; CHECK-NEXT: add.w r1, r2, r11, lsl #2
; CHECK-NEXT: vadd.f32 s16, s16, s17
; CHECK-NEXT: vadd.f32 s18, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s14, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s6, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s10, s2, s3
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s2, s16, s20
; CHECK-NEXT: vadd.f32 s12, s12, s18
; CHECK-NEXT: vadd.f32 s4, s4, s14
; CHECK-NEXT: vadd.f32 s6, s8, s6
; CHECK-NEXT: vadd.f32 s0, s0, s10
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: adds r0, #5
; CHECK-NEXT: vstr s12, [r1]
; CHECK-NEXT: add.w r1, r2, r10, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add r8, r1
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB4_2
; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -5
%cmp3134 = icmp ugt i32 %sub, 1
br i1 %cmp3134, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
%k2.0135 = phi i32 [ %add52, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0135, %0
%add = add nuw i32 %k2.0135, 1
%mul5 = mul i32 %add, %0
%add6 = add i32 %k2.0135, 2
%mul7 = mul i32 %add6, %0
%add8 = add i32 %k2.0135, 3
%mul9 = mul i32 %add8, %0
%add10 = add i32 %k2.0135, 4
%mul11 = mul i32 %add10, %0
br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %30, %vector.body ]
%vec.phi136 = phi <4 x float> [ zeroinitializer, %for.body ], [ %31, %vector.body ]
%vec.phi137 = phi <4 x float> [ zeroinitializer, %for.body ], [ %32, %vector.body ]
%vec.phi138 = phi <4 x float> [ zeroinitializer, %for.body ], [ %33, %vector.body ]
%vec.phi139 = phi <4 x float> [ zeroinitializer, %for.body ], [ %34, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load140 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load140, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi137
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load141 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load141, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi139
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load142 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load142, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi138
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load143 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load143, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi136
%25 = add i32 %index, %mul11
%26 = getelementptr inbounds float, float* %2, i32 %25
%27 = bitcast float* %26 to <4 x float>*
%wide.masked.load144 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%28 = fmul fast <4 x float> %wide.masked.load144, %wide.masked.load
%29 = fadd fast <4 x float> %28, %vec.phi
%30 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi
%31 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi136
%32 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi137
%33 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi138
%34 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi139
%index.next = add i32 %index, 4
%35 = icmp eq i32 %index.next, %n.vec
br i1 %35, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
%36 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %34)
%37 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %33)
%38 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %32)
%39 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %31)
%40 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %30)
%arrayidx42 = getelementptr inbounds float, float* %pOut, i32 %k2.0135
store float %38, float* %arrayidx42, align 4
%arrayidx44 = getelementptr inbounds float, float* %pOut, i32 %add
store float %36, float* %arrayidx44, align 4
%arrayidx46 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %37, float* %arrayidx46, align 4
%arrayidx48 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %39, float* %arrayidx48, align 4
%arrayidx50 = getelementptr inbounds float, float* %pOut, i32 %add10
store float %40, float* %arrayidx50, align 4
%add52 = add i32 %k2.0135, 5
%cmp3 = icmp ult i32 %add52, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

define void @DCT_mve6(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve6:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #6
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB5_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r8, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: add.w r1, r3, r3, lsl #1
; CHECK-NEXT: lsls r1, r1, #3
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB5_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB5_3 Depth 2
; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: add.w r11, r0, #2
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #1
; CHECK-NEXT: mov r3, r8
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vmov q4, q1
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vmov q5, q1
; CHECK-NEXT: vmov q2, q1
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB5_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r12, r3, r5
; CHECK-NEXT: vldrw.u32 q6, [r1], #16
; CHECK-NEXT: vldrw.u32 q7, [r3], #16
; CHECK-NEXT: vfma.f32 q4, q7, q6
; CHECK-NEXT: add.w r10, r12, r5
; CHECK-NEXT: vldrw.u32 q7, [r12]
; CHECK-NEXT: vfma.f32 q5, q7, q6
; CHECK-NEXT: add.w r6, r10, r5
; CHECK-NEXT: vldrw.u32 q7, [r10]
; CHECK-NEXT: vfma.f32 q2, q7, q6
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vldrw.u32 q7, [r6]
; CHECK-NEXT: vfma.f32 q0, q7, q6
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vldrw.u32 q7, [r7]
; CHECK-NEXT: vfma.f32 q3, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [r6]
; CHECK-NEXT: vfma.f32 q1, q7, q6
; CHECK-NEXT: letp lr, .LBB5_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB5_2 Depth=1
; CHECK-NEXT: vadd.f32 s24, s22, s23
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vadd.f32 s20, s20, s21
; CHECK-NEXT: vadd.f32 s22, s18, s19
; CHECK-NEXT: vadd.f32 s16, s16, s17
; CHECK-NEXT: vadd.f32 s18, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s6, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s14, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s10, s2, s3
; CHECK-NEXT: vadd.f32 s2, s20, s24
; CHECK-NEXT: vadd.f32 s1, s16, s22
; CHECK-NEXT: vadd.f32 s6, s12, s6
; CHECK-NEXT: vadd.f32 s4, s4, s18
; CHECK-NEXT: vadd.f32 s8, s8, s14
; CHECK-NEXT: vadd.f32 s0, s0, s10
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: vstr s1, [r1]
; CHECK-NEXT: add.w r1, r2, r11, lsl #2
; CHECK-NEXT: vstr s8, [r1]
; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add r8, r1
; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB5_2
; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -6
%cmp3155 = icmp ugt i32 %sub, 1
br i1 %cmp3155, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup:                                 ; preds = %middle.block, %entry
ret void

for.body:                                         ; preds = %for.body.preheader, %middle.block
%k2.0156 = phi i32 [ %add61, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0156, %0
%add = add nuw i32 %k2.0156, 1
%mul5 = mul i32 %add, %0
%add6 = add i32 %k2.0156, 2
%mul7 = mul i32 %add6, %0
%add8 = add i32 %k2.0156, 3
%mul9 = mul i32 %add8, %0
%add10 = add i32 %k2.0156, 4
%mul11 = mul i32 %add10, %0
%add12 = add i32 %k2.0156, 5
%mul13 = mul i32 %add12, %0
br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %35, %vector.body ]
%vec.phi157 = phi <4 x float> [ zeroinitializer, %for.body ], [ %36, %vector.body ]
%vec.phi158 = phi <4 x float> [ zeroinitializer, %for.body ], [ %37, %vector.body ]
%vec.phi159 = phi <4 x float> [ zeroinitializer, %for.body ], [ %38, %vector.body ]
%vec.phi160 = phi <4 x float> [ zeroinitializer, %for.body ], [ %39, %vector.body ]
%vec.phi161 = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load162 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load162, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi158
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load163 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load163, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi160
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load164 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load164, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi161
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load165 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load165, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi159
%25 = add i32 %index, %mul11
%26 = getelementptr inbounds float, float* %2, i32 %25
%27 = bitcast float* %26 to <4 x float>*
%wide.masked.load166 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%28 = fmul fast <4 x float> %wide.masked.load166, %wide.masked.load
%29 = fadd fast <4 x float> %28, %vec.phi157
%30 = add i32 %index, %mul13
%31 = getelementptr inbounds float, float* %2, i32 %30
%32 = bitcast float* %31 to <4 x float>*
%wide.masked.load167 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%33 = fmul fast <4 x float> %wide.masked.load167, %wide.masked.load
%34 = fadd fast <4 x float> %33, %vec.phi
%35 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi
%36 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi157
%37 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi158
%38 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi159
%39 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi160
%40 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi161
%index.next = add i32 %index, 4
%41 = icmp eq i32 %index.next, %n.vec
br i1 %41, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
%42 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40)
%43 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %39)
%44 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %38)
%45 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %37)
%46 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %36)
%47 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %35)
%arrayidx49 = getelementptr inbounds float, float* %pOut, i32 %k2.0156
store float %45, float* %arrayidx49, align 4
%arrayidx51 = getelementptr inbounds float, float* %pOut, i32 %add
store float %43, float* %arrayidx51, align 4
%arrayidx53 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %42, float* %arrayidx53, align 4
%arrayidx55 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %44, float* %arrayidx55, align 4
%arrayidx57 = getelementptr inbounds float, float* %pOut, i32 %add10
store float %46, float* %arrayidx57, align 4
%arrayidx59 = getelementptr inbounds float, float* %pOut, i32 %add12
store float %47, float* %arrayidx59, align 4
%add61 = add i32 %k2.0156, 6
%cmp3 = icmp ult i32 %add61, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

define void @DCT_mve7(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
|
|
; CHECK-LABEL: DCT_mve7:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: .pad #4
|
|
; CHECK-NEXT: sub sp, #4
|
|
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
|
|
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
|
|
; CHECK-NEXT: .pad #88
|
|
; CHECK-NEXT: sub sp, #88
|
|
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
|
|
; CHECK-NEXT: ldr r1, [r0, #4]
|
|
; CHECK-NEXT: subs r1, #7
|
|
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
|
|
; CHECK-NEXT: cmp r1, #2
|
|
; CHECK-NEXT: blo.w .LBB6_5
|
|
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
|
|
; CHECK-NEXT: ldr r3, [r0, #8]
|
|
; CHECK-NEXT: ldr r1, [r0]
|
|
; CHECK-NEXT: adds r0, r3, #3
|
|
; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
|
|
; CHECK-NEXT: bic r0, r0, #3
|
|
; CHECK-NEXT: add.w r9, r1, r3, lsl #2
|
|
; CHECK-NEXT: subs r1, r0, #4
|
|
; CHECK-NEXT: movs r0, #1
|
|
; CHECK-NEXT: lsls r5, r3, #2
|
|
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
|
|
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
|
|
; CHECK-NEXT: rsb r1, r3, r3, lsl #3
|
|
; CHECK-NEXT: lsls r1, r1, #2
|
|
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
|
|
; CHECK-NEXT: .LBB6_2: @ %for.body
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
|
; CHECK-NEXT: @ Child Loop BB6_3 Depth 2
|
|
; CHECK-NEXT: adds r1, r0, #6
|
|
; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill
|
|
; CHECK-NEXT: adds r1, r0, #5
|
|
; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill
|
|
; CHECK-NEXT: adds r1, r0, #4
|
|
; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
|
|
; CHECK-NEXT: adds r1, r0, #3
|
|
; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload
|
|
; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
|
|
; CHECK-NEXT: vmov.i32 q2, #0x0
|
|
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
|
|
; CHECK-NEXT: adds r4, r0, #2
|
|
; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
|
|
; CHECK-NEXT: add.w r8, r0, #1
|
|
; CHECK-NEXT: mov r3, r9
|
|
; CHECK-NEXT: vmov q4, q2
|
|
; CHECK-NEXT: vmov q5, q2
|
|
; CHECK-NEXT: vmov q3, q2
|
|
; CHECK-NEXT: vmov q6, q2
|
|
; CHECK-NEXT: vmov q1, q2
|
|
; CHECK-NEXT: mov r12, r7
|
|
; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
|
|
; CHECK-NEXT: dls lr, r6
; CHECK-NEXT: .LBB6_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r10, r3, r5
; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q7, [r1], #16
; CHECK-NEXT: vldrwt.u32 q0, [r3], #16
; CHECK-NEXT: vfmat.f32 q5, q0, q7
; CHECK-NEXT: add.w r11, r10, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r10]
; CHECK-NEXT: vfmat.f32 q6, q0, q7
; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r11]
; CHECK-NEXT: vfmat.f32 q1, q0, q7
; CHECK-NEXT: add.w r6, r11, r5
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vmov q5, q4
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q0, [r6]
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vldrw.u32 q1, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q1, q0, q7
; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vmov q2, q4
; CHECK-NEXT: vmov q4, q5
; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r7]
; CHECK-NEXT: vfmat.f32 q3, q0, q7
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrwt.u32 q0, [r6]
; CHECK-NEXT: vfmat.f32 q4, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r7]
; CHECK-NEXT: vfmat.f32 q2, q0, q7
; CHECK-NEXT: le lr, .LBB6_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB6_2 Depth=1
; CHECK-NEXT: vadd.f32 s0, s26, s27
; CHECK-NEXT: add.w r1, r2, r8, lsl #2
; CHECK-NEXT: vadd.f32 s2, s24, s25
; CHECK-NEXT: vadd.f32 s3, s20, s21
; CHECK-NEXT: vadd.f32 s1, s22, s23
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s20, s10, s11
; CHECK-NEXT: vadd.f32 s11, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s14, s6, s7
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vadd.f32 s10, s18, s19
; CHECK-NEXT: vadd.f32 s9, s16, s17
; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s2, s3, s1
; CHECK-NEXT: vadd.f32 s6, s18, s19
; CHECK-NEXT: vadd.f32 s5, s16, s17
; CHECK-NEXT: vadd.f32 s4, s4, s14
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s12, s12, s11
; CHECK-NEXT: adds r0, #7
; CHECK-NEXT: vadd.f32 s10, s9, s10
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vadd.f32 s8, s8, s20
; CHECK-NEXT: vadd.f32 s6, s5, s6
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s10, [r1]
; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: add r9, r1
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB6_2
; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #88
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -7
%cmp3176 = icmp ugt i32 %sub, 1
br i1 %cmp3176, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
ret void

for.body: ; preds = %for.body.preheader, %middle.block
%k2.0177 = phi i32 [ %add70, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0177, %0
%add = add nuw i32 %k2.0177, 1
%mul5 = mul i32 %add, %0
%add6 = add i32 %k2.0177, 2
%mul7 = mul i32 %add6, %0
%add8 = add i32 %k2.0177, 3
%mul9 = mul i32 %add8, %0
%add10 = add i32 %k2.0177, 4
%mul11 = mul i32 %add10, %0
%add12 = add i32 %k2.0177, 5
%mul13 = mul i32 %add12, %0
%add14 = add i32 %k2.0177, 6
%mul15 = mul i32 %add14, %0
br label %vector.body

vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %40, %vector.body ]
%vec.phi178 = phi <4 x float> [ zeroinitializer, %for.body ], [ %41, %vector.body ]
%vec.phi179 = phi <4 x float> [ zeroinitializer, %for.body ], [ %42, %vector.body ]
%vec.phi180 = phi <4 x float> [ zeroinitializer, %for.body ], [ %43, %vector.body ]
%vec.phi181 = phi <4 x float> [ zeroinitializer, %for.body ], [ %44, %vector.body ]
%vec.phi182 = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ]
%vec.phi183 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load184 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load184, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi179
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load185 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load185, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi181
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load186 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load186, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi183
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load187 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load187, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi182
%25 = add i32 %index, %mul11
%26 = getelementptr inbounds float, float* %2, i32 %25
%27 = bitcast float* %26 to <4 x float>*
%wide.masked.load188 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%28 = fmul fast <4 x float> %wide.masked.load188, %wide.masked.load
%29 = fadd fast <4 x float> %28, %vec.phi180
%30 = add i32 %index, %mul13
%31 = getelementptr inbounds float, float* %2, i32 %30
%32 = bitcast float* %31 to <4 x float>*
%wide.masked.load189 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%33 = fmul fast <4 x float> %wide.masked.load189, %wide.masked.load
%34 = fadd fast <4 x float> %33, %vec.phi178
%35 = add i32 %index, %mul15
%36 = getelementptr inbounds float, float* %2, i32 %35
%37 = bitcast float* %36 to <4 x float>*
%wide.masked.load190 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%38 = fmul fast <4 x float> %wide.masked.load190, %wide.masked.load
%39 = fadd fast <4 x float> %38, %vec.phi
%40 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi
%41 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi178
%42 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi179
%43 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi180
%44 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi181
%45 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi182
%46 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi183
%index.next = add i32 %index, 4
%47 = icmp eq i32 %index.next, %n.vec
br i1 %47, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
%48 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46)
%49 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45)
%50 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %44)
%51 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %43)
%52 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %42)
%53 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %41)
%54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %40)
%arrayidx56 = getelementptr inbounds float, float* %pOut, i32 %k2.0177
store float %52, float* %arrayidx56, align 4
%arrayidx58 = getelementptr inbounds float, float* %pOut, i32 %add
store float %50, float* %arrayidx58, align 4
%arrayidx60 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %48, float* %arrayidx60, align 4
%arrayidx62 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %49, float* %arrayidx62, align 4
%arrayidx64 = getelementptr inbounds float, float* %pOut, i32 %add10
store float %51, float* %arrayidx64, align 4
%arrayidx66 = getelementptr inbounds float, float* %pOut, i32 %add12
store float %53, float* %arrayidx66, align 4
%arrayidx68 = getelementptr inbounds float, float* %pOut, i32 %add14
store float %54, float* %arrayidx68, align 4
%add70 = add i32 %k2.0177, 7
%cmp3 = icmp ult i32 %add70, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

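; DCT_mve8 repeats the same reduction with eight accumulators. The eight live
; vector PHIs plus the two vectors loaded each iteration no longer fit in the
; eight MVE q registers, which is presumably why the CHECK lines below expect
; accumulator spills and reloads around the predicated vfma sequence.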
define void @DCT_mve8(%struct.DCT_InstanceTypeDef* nocapture readonly %S, float* nocapture readonly %pIn, float* nocapture %pOut) {
; CHECK-LABEL: DCT_mve8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #104
; CHECK-NEXT: sub sp, #104
; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #8
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB7_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r3, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: adds r0, r3, #3
; CHECK-NEXT: str r3, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: add.w r12, r1, r3, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: lsls r5, r3, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: lsls r1, r3, #5
; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: .LBB7_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB7_3 Depth 2
; CHECK-NEXT: adds r1, r0, #7
; CHECK-NEXT: str r1, [sp, #44] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #6
; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: ldr.w r9, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #3
; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
; CHECK-NEXT: add.w r8, r0, #2
; CHECK-NEXT: adds r1, r0, #1
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vmov q6, q3
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q7, q3
; CHECK-NEXT: vmov q2, q3
; CHECK-NEXT: mov r10, r7
; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vstrw.32 q3, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: dls lr, r6
; CHECK-NEXT: .LBB7_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r11, r3, r5
; CHECK-NEXT: vctp.32 r10
; CHECK-NEXT: vpsttt
; CHECK-NEXT: vldrwt.u32 q0, [r9], #16
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
; CHECK-NEXT: vfmat.f32 q6, q1, q0
; CHECK-NEXT: vstrw.32 q6, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r11]
; CHECK-NEXT: vfmat.f32 q7, q1, q0
; CHECK-NEXT: add.w r6, r11, r5
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vmov q5, q3
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vmov q4, q2
; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrwt.u32 q1, [r7]
; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: vmov q2, q4
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q3, q5
; CHECK-NEXT: vmov q5, q6
; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: sub.w r10, r10, #4
; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpstttt
; CHECK-NEXT: vldrwt.u32 q1, [r7]
; CHECK-NEXT: vfmat.f32 q4, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q5, q1, q0
; CHECK-NEXT: add r6, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r6]
; CHECK-NEXT: vfmat.f32 q3, q1, q0
; CHECK-NEXT: le lr, .LBB7_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB7_2 Depth=1
; CHECK-NEXT: vadd.f32 s0, s30, s31
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vadd.f32 s2, s28, s29
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s5, s14, s15
; CHECK-NEXT: vadd.f32 s4, s26, s27
; CHECK-NEXT: vadd.f32 s6, s24, s25
; CHECK-NEXT: vadd.f32 s14, s18, s19
; CHECK-NEXT: vadd.f32 s7, s16, s17
; CHECK-NEXT: vldrw.u32 q4, [sp, #64] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s13, s10, s11
; CHECK-NEXT: vadd.f32 s10, s18, s19
; CHECK-NEXT: vadd.f32 s9, s16, s17
; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vadd.f32 s11, s18, s19
; CHECK-NEXT: vadd.f32 s15, s16, s17
; CHECK-NEXT: vadd.f32 s2, s6, s4
; CHECK-NEXT: vadd.f32 s6, s12, s5
; CHECK-NEXT: vadd.f32 s12, s7, s14
; CHECK-NEXT: vadd.f32 s10, s9, s10
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s8, s8, s13
; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: vadd.f32 s14, s15, s11
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r8, lsl #2
; CHECK-NEXT: vadd.f32 s1, s22, s23
; CHECK-NEXT: vadd.f32 s3, s20, s21
; CHECK-NEXT: vstr s10, [r1]
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vstr s14, [r1]
; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s4, s3, s1
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
; CHECK-NEXT: ldr r1, [sp, #40] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp, #44] @ 4-byte Reload
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: add r12, r1
; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB7_2
; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #104
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%NumInputs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 2
%0 = load i32, i32* %NumInputs, align 4
%NumFilters = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 1
%1 = load i32, i32* %NumFilters, align 4
%pDCTCoefs = getelementptr inbounds %struct.DCT_InstanceTypeDef, %struct.DCT_InstanceTypeDef* %S, i32 0, i32 0
%2 = load float*, float** %pDCTCoefs, align 4
%cmp = icmp ugt i32 %0, 1
tail call void @llvm.assume(i1 %cmp)
%sub = add i32 %1, -8
%cmp3197 = icmp ugt i32 %sub, 1
br i1 %cmp3197, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
%n.rnd.up = add i32 %0, 3
%n.vec = and i32 %n.rnd.up, -4
br label %for.body

for.cond.cleanup: ; preds = %middle.block, %entry
ret void

for.body: ; preds = %for.body.preheader, %middle.block
%k2.0198 = phi i32 [ %add79, %middle.block ], [ 1, %for.body.preheader ]
%mul4 = mul i32 %k2.0198, %0
%add = add nuw nsw i32 %k2.0198, 1
%mul5 = mul i32 %add, %0
%add6 = add nuw nsw i32 %k2.0198, 2
%mul7 = mul i32 %add6, %0
%add8 = add nuw nsw i32 %k2.0198, 3
%mul9 = mul i32 %add8, %0
%add10 = add nuw nsw i32 %k2.0198, 4
%mul11 = mul i32 %add10, %0
%add12 = add nuw nsw i32 %k2.0198, 5
%mul13 = mul i32 %add12, %0
%add14 = add nuw nsw i32 %k2.0198, 6
%mul15 = mul i32 %add14, %0
%add16 = add i32 %k2.0198, 7
%mul17 = mul i32 %add16, %0
br label %vector.body

vector.body: ; preds = %vector.body, %for.body
%index = phi i32 [ 0, %for.body ], [ %index.next, %vector.body ]
%vec.phi = phi <4 x float> [ zeroinitializer, %for.body ], [ %45, %vector.body ]
%vec.phi199 = phi <4 x float> [ zeroinitializer, %for.body ], [ %46, %vector.body ]
%vec.phi200 = phi <4 x float> [ zeroinitializer, %for.body ], [ %47, %vector.body ]
%vec.phi201 = phi <4 x float> [ zeroinitializer, %for.body ], [ %48, %vector.body ]
%vec.phi202 = phi <4 x float> [ zeroinitializer, %for.body ], [ %49, %vector.body ]
%vec.phi203 = phi <4 x float> [ zeroinitializer, %for.body ], [ %50, %vector.body ]
%vec.phi204 = phi <4 x float> [ zeroinitializer, %for.body ], [ %51, %vector.body ]
%vec.phi205 = phi <4 x float> [ zeroinitializer, %for.body ], [ %52, %vector.body ]
%active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %0)
%3 = getelementptr inbounds float, float* %pIn, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.masked.load = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %4, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%5 = add i32 %index, %mul4
%6 = getelementptr inbounds float, float* %2, i32 %5
%7 = bitcast float* %6 to <4 x float>*
%wide.masked.load206 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %7, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%8 = fmul fast <4 x float> %wide.masked.load206, %wide.masked.load
%9 = fadd fast <4 x float> %8, %vec.phi200
%10 = add i32 %index, %mul5
%11 = getelementptr inbounds float, float* %2, i32 %10
%12 = bitcast float* %11 to <4 x float>*
%wide.masked.load207 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %12, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%13 = fmul fast <4 x float> %wide.masked.load207, %wide.masked.load
%14 = fadd fast <4 x float> %13, %vec.phi202
%15 = add i32 %index, %mul7
%16 = getelementptr inbounds float, float* %2, i32 %15
%17 = bitcast float* %16 to <4 x float>*
%wide.masked.load208 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %17, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%18 = fmul fast <4 x float> %wide.masked.load208, %wide.masked.load
%19 = fadd fast <4 x float> %18, %vec.phi204
%20 = add i32 %index, %mul9
%21 = getelementptr inbounds float, float* %2, i32 %20
%22 = bitcast float* %21 to <4 x float>*
%wide.masked.load209 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %22, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%23 = fmul fast <4 x float> %wide.masked.load209, %wide.masked.load
%24 = fadd fast <4 x float> %23, %vec.phi205
%25 = add i32 %index, %mul11
%26 = getelementptr inbounds float, float* %2, i32 %25
%27 = bitcast float* %26 to <4 x float>*
%wide.masked.load210 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %27, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%28 = fmul fast <4 x float> %wide.masked.load210, %wide.masked.load
%29 = fadd fast <4 x float> %28, %vec.phi203
%30 = add i32 %index, %mul13
%31 = getelementptr inbounds float, float* %2, i32 %30
%32 = bitcast float* %31 to <4 x float>*
%wide.masked.load211 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %32, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%33 = fmul fast <4 x float> %wide.masked.load211, %wide.masked.load
%34 = fadd fast <4 x float> %33, %vec.phi201
%35 = add i32 %index, %mul15
%36 = getelementptr inbounds float, float* %2, i32 %35
%37 = bitcast float* %36 to <4 x float>*
%wide.masked.load212 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %37, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%38 = fmul fast <4 x float> %wide.masked.load212, %wide.masked.load
%39 = fadd fast <4 x float> %38, %vec.phi199
%40 = add i32 %index, %mul17
%41 = getelementptr inbounds float, float* %2, i32 %40
%42 = bitcast float* %41 to <4 x float>*
%wide.masked.load213 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %42, i32 4, <4 x i1> %active.lane.mask, <4 x float> undef)
%43 = fmul fast <4 x float> %wide.masked.load213, %wide.masked.load
%44 = fadd fast <4 x float> %43, %vec.phi
%45 = select <4 x i1> %active.lane.mask, <4 x float> %44, <4 x float> %vec.phi
%46 = select <4 x i1> %active.lane.mask, <4 x float> %39, <4 x float> %vec.phi199
%47 = select <4 x i1> %active.lane.mask, <4 x float> %9, <4 x float> %vec.phi200
%48 = select <4 x i1> %active.lane.mask, <4 x float> %34, <4 x float> %vec.phi201
%49 = select <4 x i1> %active.lane.mask, <4 x float> %14, <4 x float> %vec.phi202
%50 = select <4 x i1> %active.lane.mask, <4 x float> %29, <4 x float> %vec.phi203
%51 = select <4 x i1> %active.lane.mask, <4 x float> %19, <4 x float> %vec.phi204
%52 = select <4 x i1> %active.lane.mask, <4 x float> %24, <4 x float> %vec.phi205
%index.next = add i32 %index, 4
%53 = icmp eq i32 %index.next, %n.vec
br i1 %53, label %middle.block, label %vector.body

middle.block: ; preds = %vector.body
%54 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %52)
%55 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %51)
%56 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %50)
%57 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %49)
%58 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %48)
%59 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %47)
%60 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %46)
%61 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> %45)
%arrayidx63 = getelementptr inbounds float, float* %pOut, i32 %k2.0198
store float %59, float* %arrayidx63, align 4
%arrayidx65 = getelementptr inbounds float, float* %pOut, i32 %add
store float %57, float* %arrayidx65, align 4
%arrayidx67 = getelementptr inbounds float, float* %pOut, i32 %add6
store float %55, float* %arrayidx67, align 4
%arrayidx69 = getelementptr inbounds float, float* %pOut, i32 %add8
store float %54, float* %arrayidx69, align 4
%arrayidx71 = getelementptr inbounds float, float* %pOut, i32 %add10
store float %56, float* %arrayidx71, align 4
%arrayidx73 = getelementptr inbounds float, float* %pOut, i32 %add12
store float %58, float* %arrayidx73, align 4
%arrayidx75 = getelementptr inbounds float, float* %pOut, i32 %add14
store float %60, float* %arrayidx75, align 4
%arrayidx77 = getelementptr inbounds float, float* %pOut, i32 %add16
store float %61, float* %arrayidx77, align 4
%add79 = add i32 %k2.0198, 8
%cmp3 = icmp ult i32 %add79, %sub
br i1 %cmp3, label %for.body, label %for.cond.cleanup
}

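; Declarations for the intrinsics used by the functions above.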
declare void @llvm.assume(i1 noundef)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)