mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-02-01 05:01:59 +01:00
0b35e38c5b
Some MVE floating-point instructions have GPR variants that take a scalar GPR value and splat it to all lanes. In order to use them in loops, the shufflevector and insertelement need to be sunk down into the loop, next to the instruction, so that ISel can see the whole pattern. This does that sinking for FAdd, FSub, FMul and FCmp. The patterns for mul are slightly more constrained, as there are no fms variants taking register arguments. Differential Revision: https://reviews.llvm.org/D76023
1469 lines
66 KiB
LLVM
1469 lines
66 KiB
LLVM
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
|
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
|
|
|
|
; C[i] = A[i] + splat(*BB), with the splat as the right-hand fadd operand.
; The splat is built in %vector.ph, outside the loop; the expected assembly
; has no vdup and uses the GPR-operand form "vadd.f16 q0, q0, r1" in the loop.
define arm_aapcs_vfpcc void @test_fadd(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fadd:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r1]
; CHECK-NEXT: vmov.f16 r1, s0
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vadd.f16 q0, q0, r1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: bne .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %BB
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %res = fadd fast <8 x half> %a.vec, %splat
  %c.gep = getelementptr inbounds half, half* %C, i32 %index
  %c.vptr = bitcast half* %c.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %c.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
; C[i] = splat(*BB) + A[i]: same as test_fadd but with the splat as the
; left-hand fadd operand. The expected assembly is the same GPR-operand
; "vadd.f16 q0, q0, r1" with no vdup.
define arm_aapcs_vfpcc void @test_fadd_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fadd_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r1]
; CHECK-NEXT: vmov.f16 r1, s0
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vadd.f16 q0, q0, r1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: bne .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %BB
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %res = fadd fast <8 x half> %splat, %a.vec
  %c.gep = getelementptr inbounds half, half* %C, i32 %index
  %c.vptr = bitcast half* %c.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %c.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
; C[i] = A[i] * splat(*BB), with the splat as the right-hand fmul operand.
; The splat is built in %vector.ph, outside the loop; the expected assembly
; has no vdup and uses the GPR-operand form "vmul.f16 q0, q0, r1" in the loop.
define arm_aapcs_vfpcc void @test_fmul(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fmul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r1]
; CHECK-NEXT: vmov.f16 r1, s0
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vmul.f16 q0, q0, r1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: bne .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %BB
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %res = fmul fast <8 x half> %a.vec, %splat
  %c.gep = getelementptr inbounds half, half* %C, i32 %index
  %c.vptr = bitcast half* %c.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %c.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
; C[i] = splat(*BB) * A[i]: same as test_fmul but with the splat as the
; left-hand fmul operand. The expected assembly is the same GPR-operand
; "vmul.f16 q0, q0, r1" with no vdup.
define arm_aapcs_vfpcc void @test_fmul_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fmul_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r1]
; CHECK-NEXT: vmov.f16 r1, s0
; CHECK-NEXT: .LBB3_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vmul.f16 q0, q0, r1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: bne .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %BB
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %res = fmul fast <8 x half> %splat, %a.vec
  %c.gep = getelementptr inbounds half, half* %C, i32 %index
  %c.vptr = bitcast half* %c.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %c.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
; C[i] = A[i] - splat(*BB), with the splat as the subtrahend (right-hand
; operand). The splat is built in %vector.ph, outside the loop; the expected
; assembly has no vdup and uses the GPR-operand form "vsub.f16 q0, q0, r1".
define arm_aapcs_vfpcc void @test_fsub(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fsub:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r1]
; CHECK-NEXT: vmov.f16 r1, s0
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vsub.f16 q0, q0, r1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: bne .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %BB
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %res = fsub fast <8 x half> %a.vec, %splat
  %c.gep = getelementptr inbounds half, half* %C, i32 %index
  %c.vptr = bitcast half* %c.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %c.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
; C[i] = splat(*BB) - A[i]: the splat is the minuend (left-hand operand).
; Unlike test_fsub, the expected assembly keeps "vdup.16 q0, r1" ahead of
; the loop and uses the vector-vector "vsub.f16 q1, q0, q1" in the loop body.
define arm_aapcs_vfpcc void @test_fsub_r(half* noalias nocapture readonly %A, half *%BB, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fsub_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r1]
; CHECK-NEXT: vmov.f16 r1, s0
; CHECK-NEXT: vdup.16 q0, r1
; CHECK-NEXT: .LBB5_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: subs r3, #8
; CHECK-NEXT: vsub.f16 q1, q0, q1
; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: bne .LBB5_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %BB
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %res = fsub fast <8 x half> %splat, %a.vec
  %c.gep = getelementptr inbounds half, half* %C, i32 %index
  %c.vptr = bitcast half* %c.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %c.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
|
|
; D[i] = (B[i] * A[i]) + splat(*CC), with the splat as the right-hand fadd
; operand. The splat is built in %vector.ph, outside the loop; the expected
; assembly has no vdup and uses "vfmas.f16 q1, q0, r2" in the loop.
define arm_aapcs_vfpcc void @test_fmas(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmas:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [sp]
; CHECK-NEXT: cmp.w r12, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r2]
; CHECK-NEXT: vmov.f16 r2, s0
; CHECK-NEXT: .LBB6_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: subs.w r12, r12, #8
; CHECK-NEXT: vfmas.f16 q1, q0, r2
; CHECK-NEXT: vstrb.8 q1, [r3], #16
; CHECK-NEXT: bne .LBB6_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %CC
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %b.gep = getelementptr inbounds half, half* %B, i32 %index
  %b.vptr = bitcast half* %b.gep to <8 x half>*
  %b.vec = load <8 x half>, <8 x half>* %b.vptr, align 4
  %prod = fmul fast <8 x half> %b.vec, %a.vec
  %res = fadd fast <8 x half> %prod, %splat
  %d.gep = getelementptr inbounds half, half* %D, i32 %index
  %d.vptr = bitcast half* %d.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %d.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
; D[i] = splat(*CC) + (B[i] * A[i]): same as test_fmas but with the splat as
; the left-hand fadd operand. The expected assembly is the same
; "vfmas.f16 q1, q0, r2" with no vdup.
define arm_aapcs_vfpcc void @test_fmas_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmas_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [sp]
; CHECK-NEXT: cmp.w r12, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r2]
; CHECK-NEXT: vmov.f16 r2, s0
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: subs.w r12, r12, #8
; CHECK-NEXT: vfmas.f16 q1, q0, r2
; CHECK-NEXT: vstrb.8 q1, [r3], #16
; CHECK-NEXT: bne .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %CC
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %b.gep = getelementptr inbounds half, half* %B, i32 %index
  %b.vptr = bitcast half* %b.gep to <8 x half>*
  %b.vec = load <8 x half>, <8 x half>* %b.vptr, align 4
  %prod = fmul fast <8 x half> %b.vec, %a.vec
  %res = fadd fast <8 x half> %splat, %prod
  %d.gep = getelementptr inbounds half, half* %D, i32 %index
  %d.vptr = bitcast half* %d.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %d.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
; D[i] = (A[i] * splat(*CC)) + B[i], with the splat as the right-hand fmul
; operand. The splat is built in %vector.ph, outside the loop; the expected
; assembly has no vdup and uses "vfma.f16 q1, q0, r2" in the loop.
define arm_aapcs_vfpcc void @test_fma(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fma:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [sp]
; CHECK-NEXT: cmp.w r12, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r2]
; CHECK-NEXT: vmov.f16 r2, s0
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: subs.w r12, r12, #8
; CHECK-NEXT: vfma.f16 q1, q0, r2
; CHECK-NEXT: vstrb.8 q1, [r3], #16
; CHECK-NEXT: bne .LBB8_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %CC
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %prod = fmul fast <8 x half> %a.vec, %splat
  %b.gep = getelementptr inbounds half, half* %B, i32 %index
  %b.vptr = bitcast half* %b.gep to <8 x half>*
  %b.vec = load <8 x half>, <8 x half>* %b.vptr, align 4
  %res = fadd fast <8 x half> %prod, %b.vec
  %d.gep = getelementptr inbounds half, half* %D, i32 %index
  %d.vptr = bitcast half* %d.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %d.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
; D[i] = (splat(*CC) * A[i]) + B[i]: same as test_fma but with the splat as
; the left-hand fmul operand. The expected assembly is the same
; "vfma.f16 q1, q0, r2" with no vdup.
define arm_aapcs_vfpcc void @test_fma_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fma_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [sp]
; CHECK-NEXT: cmp.w r12, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r2]
; CHECK-NEXT: vmov.f16 r2, s0
; CHECK-NEXT: .LBB9_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: subs.w r12, r12, #8
; CHECK-NEXT: vfma.f16 q1, q0, r2
; CHECK-NEXT: vstrb.8 q1, [r3], #16
; CHECK-NEXT: bne .LBB9_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %CC
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %prod = fmul fast <8 x half> %splat, %a.vec
  %b.gep = getelementptr inbounds half, half* %B, i32 %index
  %b.vptr = bitcast half* %b.gep to <8 x half>*
  %b.vec = load <8 x half>, <8 x half>* %b.vptr, align 4
  %res = fadd fast <8 x half> %prod, %b.vec
  %d.gep = getelementptr inbounds half, half* %D, i32 %index
  %d.vptr = bitcast half* %d.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %d.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
|
|
; D[i] = (B[i] * A[i]) - splat(*CC), with the splat as the subtrahend.
; The expected assembly materialises the splat with "vdup.16" and negates it
; with "vneg.f16" ahead of the loop, then copies it ("vmov q3, q0") and
; accumulates with the vector-vector "vfma.f16 q3, q2, q1" in the loop body.
define arm_aapcs_vfpcc void @test_fmss(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmss:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [sp]
; CHECK-NEXT: cmp.w r12, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r2]
; CHECK-NEXT: vmov.f16 r2, s0
; CHECK-NEXT: vdup.16 q0, r2
; CHECK-NEXT: vneg.f16 q0, q0
; CHECK-NEXT: .LBB10_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: subs.w r12, r12, #8
; CHECK-NEXT: vfma.f16 q3, q2, q1
; CHECK-NEXT: vstrb.8 q3, [r3], #16
; CHECK-NEXT: bne .LBB10_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %CC
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %b.gep = getelementptr inbounds half, half* %B, i32 %index
  %b.vptr = bitcast half* %b.gep to <8 x half>*
  %b.vec = load <8 x half>, <8 x half>* %b.vptr, align 4
  %prod = fmul fast <8 x half> %b.vec, %a.vec
  %res = fsub fast <8 x half> %prod, %splat
  %d.gep = getelementptr inbounds half, half* %D, i32 %index
  %d.vptr = bitcast half* %d.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %d.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
; D[i] = splat(*CC) - (B[i] * A[i]), with the splat as the minuend.
; The expected assembly materialises the splat with "vdup.16" ahead of the
; loop, then copies it ("vmov q3, q0") and uses the vector-vector
; "vfms.f16 q3, q2, q1" in the loop body.
define arm_aapcs_vfpcc void @test_fmss_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmss_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [sp]
; CHECK-NEXT: cmp.w r12, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r2]
; CHECK-NEXT: vmov.f16 r2, s0
; CHECK-NEXT: vdup.16 q0, r2
; CHECK-NEXT: .LBB11_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: subs.w r12, r12, #8
; CHECK-NEXT: vfms.f16 q3, q2, q1
; CHECK-NEXT: vstrb.8 q3, [r3], #16
; CHECK-NEXT: bne .LBB11_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %CC
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %b.gep = getelementptr inbounds half, half* %B, i32 %index
  %b.vptr = bitcast half* %b.gep to <8 x half>*
  %b.vec = load <8 x half>, <8 x half>* %b.vptr, align 4
  %prod = fmul fast <8 x half> %b.vec, %a.vec
  %res = fsub fast <8 x half> %splat, %prod
  %d.gep = getelementptr inbounds half, half* %D, i32 %index
  %d.vptr = bitcast half* %d.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %d.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
; D[i] = (A[i] * splat(*CC)) - B[i], with the splat as the right-hand fmul
; operand. The expected assembly has no vdup: the loop negates the B vector
; with "vneg.f16 q0, q0" and then uses the GPR-operand "vfma.f16 q0, q1, r2".
define arm_aapcs_vfpcc void @test_fms(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fms:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [sp]
; CHECK-NEXT: cmp.w r12, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r2]
; CHECK-NEXT: vmov.f16 r2, s0
; CHECK-NEXT: .LBB12_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: subs.w r12, r12, #8
; CHECK-NEXT: vneg.f16 q0, q0
; CHECK-NEXT: vfma.f16 q0, q1, r2
; CHECK-NEXT: vstrb.8 q0, [r3], #16
; CHECK-NEXT: bne .LBB12_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %CC
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %prod = fmul fast <8 x half> %a.vec, %splat
  %b.gep = getelementptr inbounds half, half* %B, i32 %index
  %b.vptr = bitcast half* %b.gep to <8 x half>*
  %b.vec = load <8 x half>, <8 x half>* %b.vptr, align 4
  %res = fsub fast <8 x half> %prod, %b.vec
  %d.gep = getelementptr inbounds half, half* %D, i32 %index
  %d.vptr = bitcast half* %d.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %d.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
; D[i] = (splat(*CC) * A[i]) - B[i]: same as test_fms but with the splat as
; the left-hand fmul operand. The expected assembly is the same in-loop
; "vneg.f16 q0, q0" followed by the GPR-operand "vfma.f16 q0, q1, r2".
define arm_aapcs_vfpcc void @test_fms_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half *%CC, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fms_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [sp]
; CHECK-NEXT: cmp.w r12, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: vldr.16 s0, [r2]
; CHECK-NEXT: vmov.f16 r2, s0
; CHECK-NEXT: .LBB13_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: subs.w r12, r12, #8
; CHECK-NEXT: vneg.f16 q0, q0
; CHECK-NEXT: vfma.f16 q0, q1, r2
; CHECK-NEXT: vstrb.8 q0, [r3], #16
; CHECK-NEXT: bne .LBB13_1
; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup
; CHECK-NEXT: bx lr
entry:
  %scalar = load half, half* %CC
  ; Assume %n is a multiple of 8, the step used by the vector loop below.
  %n.rem = and i32 %n, 7
  %n.is.mult8 = icmp eq i32 %n.rem, 0
  tail call void @llvm.assume(i1 %n.is.mult8)
  %has.work = icmp sgt i32 %n, 0
  br i1 %has.work, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  ; Loop-invariant broadcast of the scalar into all eight lanes.
  %splat.ins = insertelement <8 x half> undef, half %scalar, i32 0
  %splat = shufflevector <8 x half> %splat.ins, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %a.gep = getelementptr inbounds half, half* %A, i32 %index
  %a.vptr = bitcast half* %a.gep to <8 x half>*
  %a.vec = load <8 x half>, <8 x half>* %a.vptr, align 4
  %prod = fmul fast <8 x half> %splat, %a.vec
  %b.gep = getelementptr inbounds half, half* %B, i32 %index
  %b.vptr = bitcast half* %b.gep to <8 x half>*
  %b.vec = load <8 x half>, <8 x half>* %b.vptr, align 4
  %res = fsub fast <8 x half> %prod, %b.vec
  %d.gep = getelementptr inbounds half, half* %D, i32 %index
  %d.vptr = bitcast half* %d.gep to <8 x half>*
  store <8 x half> %res, <8 x half>* %d.vptr, align 4
  %index.next = add i32 %index, 8
  %done = icmp eq i32 %index.next, %n
  br i1 %done, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}
|
|
|
|
|
|
; test_nested: a two-level loop nest over half-precision data.
; The outer loop (%for.body.us) loads one scalar half through %pOutT1 and
; splats it to <8 x half>; the inner loop (%vector.body) computes
; wide.load - wide.load46 * splat and stores the result back through the
; pointer derived from %pInT1. The splat is created in the outer loop, not
; next to its fmul user. The autogenerated assertions below expect the splat
; to be materialised with vdup.16 and combined via vfms.f16 with three vector
; operands (no GPR-operand form is expected for this multiply-subtract).
define dso_local void @test_nested(half* noalias nocapture %pInT1, half* noalias nocapture readonly %pOutT1, half* noalias nocapture readonly %pPRT_in, half* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l, half *%ina) local_unnamed_addr #0 {
|
|
; CHECK-LABEL: test_nested:
|
|
; CHECK: @ %bb.0: @ %for.body.us.preheader
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, lr}
|
|
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
|
|
; CHECK-NEXT: ldrd lr, r12, [sp, #20]
|
|
; CHECK-NEXT: lsl.w r3, r12, #1
|
|
; CHECK-NEXT: dls lr, lr
|
|
; CHECK-NEXT: .LBB14_1: @ %for.body.us
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
|
; CHECK-NEXT: @ Child Loop BB14_2 Depth 2
|
|
; CHECK-NEXT: ldrh r4, [r1]
|
|
; CHECK-NEXT: mov r5, r12
|
|
; CHECK-NEXT: vdup.16 q0, r4
|
|
; CHECK-NEXT: movs r4, #0
|
|
; CHECK-NEXT: .LBB14_2: @ %vector.body
|
|
; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT: adds r6, r0, r4
|
|
; CHECK-NEXT: adds r7, r2, r4
|
|
; CHECK-NEXT: vldrw.u32 q1, [r7]
|
|
; CHECK-NEXT: vldrw.u32 q2, [r6]
|
|
; CHECK-NEXT: adds r4, #16
|
|
; CHECK-NEXT: subs r5, #8
|
|
; CHECK-NEXT: vfms.f16 q2, q1, q0
|
|
; CHECK-NEXT: vstrw.32 q2, [r6]
|
|
; CHECK-NEXT: bne .LBB14_2
|
|
; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us
|
|
; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1
|
|
; CHECK-NEXT: add r0, r3
|
|
; CHECK-NEXT: add r2, r3
|
|
; CHECK-NEXT: adds r1, #2
|
|
; CHECK-NEXT: le lr, .LBB14_1
|
|
; CHECK-NEXT: @ %bb.4: @ %for.end14
|
|
; CHECK-NEXT: pop {r4, r5, r6, r7, pc}
|
|
|
for.body.us.preheader:
|
|
; %in is loaded here but is not referenced by any later instruction in this
; function.
%in = load half, half* %ina
|
|
; The llvm.assume calls in this block state, in order: numRows > 0,
; numCols > 0, (numCols & 7) == 0, and l < numCols (signed compares).
%cmp = icmp sgt i32 %numRows, 0
|
|
tail call void @llvm.assume(i1 %cmp)
|
|
%cmp1 = icmp sgt i32 %numCols, 0
|
|
tail call void @llvm.assume(i1 %cmp1)
|
|
%rem = and i32 %numCols, 7
|
|
%cmp2 = icmp eq i32 %rem, 0
|
|
tail call void @llvm.assume(i1 %cmp2)
|
|
%cmp3 = icmp slt i32 %l, %numCols
|
|
tail call void @llvm.assume(i1 %cmp3)
|
|
br label %for.body.us
|
|
|
|
; Outer loop: %i.037.us counts from 0 and the loop exits when the incremented
; counter equals %numRows.
for.body.us: ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader
|
|
%pInT1.addr.038.us = phi half* [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ]
|
|
%i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ]
|
|
%pOutT1.addr.036.us = phi half* [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ]
|
|
%pPRT_in.addr.035.us = phi half* [ %scevgep, %for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ]
|
|
; Next outer-iteration value of the %pPRT_in pointer: advanced by %numCols
; elements.
%scevgep = getelementptr half, half* %pPRT_in.addr.035.us, i32 %numCols
|
|
; Load the per-iteration scalar and splat it to all 8 lanes. This happens in
; the outer loop body, outside %vector.body where the splat is consumed.
%0 = load half, half* %pOutT1.addr.036.us, align 4
|
|
%broadcast.splatinsert47 = insertelement <8 x half> undef, half %0, i32 0
|
|
%broadcast.splat48 = shufflevector <8 x half> %broadcast.splatinsert47, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br label %vector.body
|
|
|
|
; Inner loop: %index advances by 8 per iteration and the loop exits when
; %index.next equals %numCols.
vector.body: ; preds = %vector.body, %for.body.us
|
|
%index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ]
|
|
%next.gep = getelementptr half, half* %pInT1.addr.038.us, i32 %index
|
|
%next.gep45 = getelementptr half, half* %pPRT_in.addr.035.us, i32 %index
|
|
%1 = bitcast half* %next.gep to <8 x half>*
|
|
%wide.load = load <8 x half>, <8 x half>* %1, align 4
|
|
%2 = bitcast half* %next.gep45 to <8 x half>*
|
|
%wide.load46 = load <8 x half>, <8 x half>* %2, align 4
|
|
; %4 = %wide.load - (%wide.load46 * splat), written back through %1, the same
; address %wide.load was read from.
%3 = fmul fast <8 x half> %wide.load46, %broadcast.splat48
|
|
%4 = fsub fast <8 x half> %wide.load, %3
|
|
store <8 x half> %4, <8 x half>* %1, align 4
|
|
%index.next = add i32 %index, 8
|
|
%5 = icmp eq i32 %index.next, %numCols
|
|
br i1 %5, label %for.cond6.for.end_crit_edge.us, label %vector.body
|
|
|
|
; Outer-loop latch: step the scalar pointer by one element and the %pInT1
; pointer by %numCols elements, then test the row counter.
for.cond6.for.end_crit_edge.us: ; preds = %vector.body
|
|
%incdec.ptr.us = getelementptr inbounds half, half* %pOutT1.addr.036.us, i32 1
|
|
%scevgep40 = getelementptr half, half* %pInT1.addr.038.us, i32 %numCols
|
|
%inc13.us = add nuw nsw i32 %i.037.us, 1
|
|
%exitcond41 = icmp eq i32 %inc13.us, %numRows
|
|
br i1 %exitcond41, label %for.end14, label %for.body.us
|
|
|
|
for.end14: ; preds = %for.cond6.for.end_crit_edge.us
|
|
ret void
|
|
}
|
|
|
|
; Instance record read by the FIR tests that follow: field 0 is an i16 and
; fields 1 and 2 are half pointers. Although the type name says f32, the
; pointer fields are half* in this file.
%struct.arm_fir_instance_f32 = type { i16, half*, half* }
|
|
; arm_fir_f32_1_4_mve: FIR-style kernel operating on <8 x half> vectors (the
; f32 in the name notwithstanding; every vector type in the body is
; <8 x half>).
;
; Phase 1 (%if.then .. %while.end), taken only when (field0 - 1) < 4 unsigned:
; four scalar coefficients are loaded from the field-2 pointer and each is
; splatted to <8 x half> in %if.then, i.e. outside the %while.body loop. The
; loop and the tail block %while.end both multiply sample vectors by those
; splats and sum the products. The autogenerated assertions expect the
; GPR-operand forms of vmul.f16 / vfma.f16 in both the loop and the tail.
;
; Phase 2 (%if.end .. %if.end61): copies vectors within the buffer addressed
; by the field-1 pointer, from offset %blockSize down to offset 0, with a
; predicated tail.
define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonly %S, half* nocapture readonly %pSrc, half* %pDst, i32 %blockSize) {
|
|
; CHECK-LABEL: arm_fir_f32_1_4_mve:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: .pad #16
|
|
; CHECK-NEXT: sub sp, #16
|
|
; CHECK-NEXT: ldrh r4, [r0]
|
|
; CHECK-NEXT: ldr r5, [r0, #4]
|
|
; CHECK-NEXT: subs r7, r4, #1
|
|
; CHECK-NEXT: cmp r7, #3
|
|
; CHECK-NEXT: bhi .LBB15_6
|
|
; CHECK-NEXT: @ %bb.1: @ %if.then
|
|
; CHECK-NEXT: ldr r6, [r0, #8]
|
|
; CHECK-NEXT: add.w r11, r5, r7, lsl #1
|
|
; CHECK-NEXT: lsr.w lr, r3, #2
|
|
; CHECK-NEXT: vldr.16 s0, [r6, #6]
|
|
; CHECK-NEXT: vldr.16 s2, [r6, #4]
|
|
; CHECK-NEXT: vldr.16 s4, [r6, #2]
|
|
; CHECK-NEXT: vldr.16 s6, [r6]
|
|
; CHECK-NEXT: wls lr, lr, .LBB15_5
|
|
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
|
|
; CHECK-NEXT: strd r3, r4, [sp, #8] @ 8-byte Folded Spill
|
|
; CHECK-NEXT: vmov.f16 r10, s6
|
|
; CHECK-NEXT: vmov.f16 r12, s4
|
|
; CHECK-NEXT: bic r3, r3, #3
|
|
; CHECK-NEXT: vmov.f16 r4, s2
|
|
; CHECK-NEXT: str r3, [sp] @ 4-byte Spill
|
|
; CHECK-NEXT: vmov.f16 r8, s0
|
|
; CHECK-NEXT: add.w r3, r2, r3, lsl #1
|
|
; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
|
|
; CHECK-NEXT: movs r6, #0
|
|
; CHECK-NEXT: mov r3, r5
|
|
; CHECK-NEXT: .LBB15_3: @ %while.body
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: add.w r9, r1, r6
|
|
; CHECK-NEXT: add.w r7, r11, r6
|
|
; CHECK-NEXT: vldrw.u32 q2, [r9]
|
|
; CHECK-NEXT: vstrw.32 q2, [r7]
|
|
; CHECK-NEXT: adds r7, r3, r6
|
|
; CHECK-NEXT: vldrw.u32 q2, [r7]
|
|
; CHECK-NEXT: adds r5, r7, #2
|
|
; CHECK-NEXT: vldrw.u32 q3, [r5]
|
|
; CHECK-NEXT: adds r5, r7, #6
|
|
; CHECK-NEXT: vmul.f16 q2, q2, r10
|
|
; CHECK-NEXT: vfma.f16 q2, q3, r12
|
|
; CHECK-NEXT: vldrw.u32 q3, [r7, #4]
|
|
; CHECK-NEXT: vfma.f16 q2, q3, r4
|
|
; CHECK-NEXT: vldrw.u32 q3, [r5]
|
|
; CHECK-NEXT: adds r5, r2, r6
|
|
; CHECK-NEXT: adds r6, #8
|
|
; CHECK-NEXT: vfma.f16 q2, q3, r8
|
|
; CHECK-NEXT: vstrw.32 q2, [r5]
|
|
; CHECK-NEXT: le lr, .LBB15_3
|
|
; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
|
|
; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
|
|
; CHECK-NEXT: add r4, sp, #4
|
|
; CHECK-NEXT: add r11, r6
|
|
; CHECK-NEXT: add.w r5, r3, r2, lsl #1
|
|
; CHECK-NEXT: add.w r1, r1, r2, lsl #1
|
|
; CHECK-NEXT: ldm r4, {r2, r3, r4} @ 12-byte Folded Reload
|
|
; CHECK-NEXT: .LBB15_5: @ %while.end
|
|
; CHECK-NEXT: and r7, r3, #3
|
|
; CHECK-NEXT: vldrw.u32 q2, [r1]
|
|
; CHECK-NEXT: vctp.16 r7
|
|
; CHECK-NEXT: vpst
|
|
; CHECK-NEXT: vstrht.16 q2, [r11]
|
|
; CHECK-NEXT: vldrw.u32 q2, [r5]
|
|
; CHECK-NEXT: vmov.f16 r1, s6
|
|
; CHECK-NEXT: adds r7, r5, #2
|
|
; CHECK-NEXT: vmul.f16 q2, q2, r1
|
|
; CHECK-NEXT: vmov.f16 r1, s4
|
|
; CHECK-NEXT: vldrw.u32 q1, [r7]
|
|
; CHECK-NEXT: adds r7, r5, #6
|
|
; CHECK-NEXT: vfma.f16 q2, q1, r1
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
|
|
; CHECK-NEXT: vmov.f16 r1, s2
|
|
; CHECK-NEXT: vfma.f16 q2, q1, r1
|
|
; CHECK-NEXT: vmov.f16 r1, s0
|
|
; CHECK-NEXT: vldrw.u32 q0, [r7]
|
|
; CHECK-NEXT: vfma.f16 q2, q0, r1
|
|
; CHECK-NEXT: vpst
|
|
; CHECK-NEXT: vstrht.16 q2, [r2]
|
|
; CHECK-NEXT: ldr r5, [r0, #4]
|
|
; CHECK-NEXT: .LBB15_6: @ %if.end
|
|
; CHECK-NEXT: add.w r0, r5, r3, lsl #1
|
|
; CHECK-NEXT: mov r2, r5
|
|
; CHECK-NEXT: lsr.w lr, r4, #2
|
|
; CHECK-NEXT: wls lr, lr, .LBB15_10
|
|
; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader
|
|
; CHECK-NEXT: bic r7, r4, #3
|
|
; CHECK-NEXT: adds r1, r7, r3
|
|
; CHECK-NEXT: mov r3, r2
|
|
; CHECK-NEXT: add.w r1, r2, r1, lsl #1
|
|
; CHECK-NEXT: .LBB15_8: @ %while.body51
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
|
|
; CHECK-NEXT: vstrb.8 q0, [r3], #8
|
|
; CHECK-NEXT: le lr, .LBB15_8
|
|
; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit
|
|
; CHECK-NEXT: add.w r2, r2, r7, lsl #1
|
|
; CHECK-NEXT: mov r0, r1
|
|
; CHECK-NEXT: .LBB15_10: @ %while.end55
|
|
; CHECK-NEXT: ands r1, r4, #3
|
|
; CHECK-NEXT: beq .LBB15_12
|
|
; CHECK-NEXT: @ %bb.11: @ %if.then59
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0]
|
|
; CHECK-NEXT: vctp.16 r1
|
|
; CHECK-NEXT: vpst
|
|
; CHECK-NEXT: vstrht.16 q0, [r2]
|
|
; CHECK-NEXT: .LBB15_12: @ %if.end61
|
|
; CHECK-NEXT: add sp, #16
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
|
; Read the three fields of %S: field 1 -> %0, field 2 -> %1, and field 0 (i16)
; zero-extended to i32 as %conv.
entry:
|
|
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1
|
|
%0 = load half*, half** %pState1, align 4
|
|
%pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2
|
|
%1 = load half*, half** %pCoeffs2, align 4
|
|
%numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0
|
|
%2 = load i16, i16* %numTaps3, align 4
|
|
%conv = zext i16 %2 to i32
|
|
; Phase 1 runs only when (%conv - 1) is unsigned-less-than 4.
%sub = add nsw i32 %conv, -1
|
|
%cmp = icmp ult i32 %sub, 4
|
|
br i1 %cmp, label %if.then, label %if.end
|
|
|
|
; Load four consecutive scalars from %1 (offsets 0..3) and splat each one to
; <8 x half>. The splats %.pre162/%.pre164/%.pre166/%.pre168 are defined in
; this block and used in both %while.body and %while.end.
if.then: ; preds = %entry
|
|
%arrayidx = getelementptr inbounds half, half* %0, i32 %sub
|
|
%incdec.ptr = getelementptr inbounds half, half* %1, i32 1
|
|
%3 = load half, half* %1, align 4
|
|
%incdec.ptr6 = getelementptr inbounds half, half* %1, i32 2
|
|
%4 = load half, half* %incdec.ptr, align 4
|
|
%incdec.ptr7 = getelementptr inbounds half, half* %1, i32 3
|
|
%5 = load half, half* %incdec.ptr6, align 4
|
|
%6 = load half, half* %incdec.ptr7, align 4
|
|
; Loop trip count is %blockSize >> 2; skip the loop entirely when it is zero.
%shr = lshr i32 %blockSize, 2
|
|
%cmp9146 = icmp eq i32 %shr, 0
|
|
%.pre161 = insertelement <8 x half> undef, half %3, i32 0
|
|
%.pre162 = shufflevector <8 x half> %.pre161, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%.pre163 = insertelement <8 x half> undef, half %4, i32 0
|
|
%.pre164 = shufflevector <8 x half> %.pre163, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%.pre165 = insertelement <8 x half> undef, half %5, i32 0
|
|
%.pre166 = shufflevector <8 x half> %.pre165, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%.pre167 = insertelement <8 x half> undef, half %6, i32 0
|
|
%.pre168 = shufflevector <8 x half> %.pre167, <8 x half> undef, <8 x i32> zeroinitializer
|
|
br i1 %cmp9146, label %while.end, label %while.body.lr.ph
|
|
|
|
; %7 = %blockSize rounded down to a multiple of 4; it is used for the
; post-loop pointer values (%scevgep157/158/159).
while.body.lr.ph: ; preds = %if.then
|
|
%7 = and i32 %blockSize, -4
|
|
%scevgep158 = getelementptr half, half* %pDst, i32 %7
|
|
br label %while.body
|
|
|
|
; Main loop. Each iteration:
;   1. copies one <8 x half> vector from the %pSrc cursor to the state cursor;
;   2. loads four <8 x half> vectors at element offsets 0, 1, 2, 3 from the
;      samples cursor, multiplies each by the matching splat, and sums them;
;   3. stores the sum to the output cursor.
; Every cursor advances by 4 halves per iteration even though each access is
; 8 halves wide, so consecutive iterations touch overlapping memory.
while.body: ; preds = %while.body.lr.ph, %while.body
|
|
%pStateCur.0151 = phi half* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ]
|
|
%pSamples.0150 = phi half* [ %0, %while.body.lr.ph ], [ %add.ptr24, %while.body ]
|
|
%pOutput.0149 = phi half* [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ]
|
|
%pTempSrc.0148 = phi half* [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ]
|
|
%blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
|
|
%8 = bitcast half* %pTempSrc.0148 to <8 x half>*
|
|
%9 = load <8 x half>, <8 x half>* %8, align 4
|
|
%10 = bitcast half* %pStateCur.0151 to <8 x half>*
|
|
store <8 x half> %9, <8 x half>* %10, align 4
|
|
%add.ptr = getelementptr inbounds half, half* %pStateCur.0151, i32 4
|
|
%add.ptr11 = getelementptr inbounds half, half* %pTempSrc.0148, i32 4
|
|
%11 = bitcast half* %pSamples.0150 to <8 x half>*
|
|
%12 = load <8 x half>, <8 x half>* %11, align 4
|
|
%13 = fmul fast <8 x half> %12, %.pre162
|
|
%arrayidx12 = getelementptr inbounds half, half* %pSamples.0150, i32 1
|
|
%14 = bitcast half* %arrayidx12 to <8 x half>*
|
|
%15 = load <8 x half>, <8 x half>* %14, align 4
|
|
%mul = fmul fast <8 x half> %15, %.pre164
|
|
%add = fadd fast <8 x half> %mul, %13
|
|
%arrayidx13 = getelementptr inbounds half, half* %pSamples.0150, i32 2
|
|
%16 = bitcast half* %arrayidx13 to <8 x half>*
|
|
%17 = load <8 x half>, <8 x half>* %16, align 4
|
|
%mul16 = fmul fast <8 x half> %17, %.pre166
|
|
%add17 = fadd fast <8 x half> %add, %mul16
|
|
%arrayidx18 = getelementptr inbounds half, half* %pSamples.0150, i32 3
|
|
%18 = bitcast half* %arrayidx18 to <8 x half>*
|
|
%19 = load <8 x half>, <8 x half>* %18, align 4
|
|
%mul21 = fmul fast <8 x half> %19, %.pre168
|
|
%add22 = fadd fast <8 x half> %add17, %mul21
|
|
%20 = bitcast half* %pOutput.0149 to <8 x half>*
|
|
store <8 x half> %add22, <8 x half>* %20, align 4
|
|
%add.ptr23 = getelementptr inbounds half, half* %pOutput.0149, i32 4
|
|
%add.ptr24 = getelementptr inbounds half, half* %pSamples.0150, i32 4
|
|
%dec = add nsw i32 %blkCnt.0147, -1
|
|
%cmp9 = icmp eq i32 %dec, 0
|
|
br i1 %cmp9, label %while.end.loopexit, label %while.body
|
|
|
|
; Post-loop values of the source and samples pointers, expressed as
; base + %7 rather than taken from the loop-carried cursors.
while.end.loopexit: ; preds = %while.body
|
|
%scevgep157 = getelementptr half, half* %pSrc, i32 %7
|
|
%scevgep159 = getelementptr half, half* %0, i32 %7
|
|
br label %while.end
|
|
|
|
; Tail of phase 1: the same copy and four-term multiply/accumulate as the loop
; body, executed once. Both stores are masked stores whose mask is
; llvm.arm.mve.vctp16(%blockSize & 3). The vector loads here are ordinary
; (unmasked) loads.
while.end: ; preds = %if.then, %while.end.loopexit
|
|
%pTempSrc.0.lcssa = phi half* [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ]
|
|
%pOutput.0.lcssa = phi half* [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ]
|
|
%pSamples.0.lcssa = phi half* [ %scevgep159, %while.end.loopexit ], [ %0, %if.then ]
|
|
%pStateCur.0.lcssa = phi half* [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ]
|
|
%and = and i32 %blockSize, 3
|
|
%21 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and)
|
|
%22 = bitcast half* %pTempSrc.0.lcssa to <8 x half>*
|
|
%23 = load <8 x half>, <8 x half>* %22, align 4
|
|
%24 = bitcast half* %pStateCur.0.lcssa to <8 x half>*
|
|
tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %23, <8 x half>* %24, i32 4, <8 x i1> %21)
|
|
%25 = bitcast half* %pSamples.0.lcssa to <8 x half>*
|
|
%26 = load <8 x half>, <8 x half>* %25, align 4
|
|
%27 = fmul fast <8 x half> %26, %.pre162
|
|
%arrayidx29 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 1
|
|
%28 = bitcast half* %arrayidx29 to <8 x half>*
|
|
%29 = load <8 x half>, <8 x half>* %28, align 4
|
|
%mul32 = fmul fast <8 x half> %29, %.pre164
|
|
%add33 = fadd fast <8 x half> %mul32, %27
|
|
%arrayidx34 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 2
|
|
%30 = bitcast half* %arrayidx34 to <8 x half>*
|
|
%31 = load <8 x half>, <8 x half>* %30, align 4
|
|
%mul37 = fmul fast <8 x half> %31, %.pre166
|
|
%add38 = fadd fast <8 x half> %add33, %mul37
|
|
%arrayidx39 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 3
|
|
%32 = bitcast half* %arrayidx39 to <8 x half>*
|
|
%33 = load <8 x half>, <8 x half>* %32, align 4
|
|
%mul42 = fmul fast <8 x half> %33, %.pre168
|
|
%add43 = fadd fast <8 x half> %add38, %mul42
|
|
%34 = bitcast half* %pOutput.0.lcssa to <8 x half>*
|
|
tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %add43, <8 x half>* %34, i32 4, <8 x i1> %21)
|
|
; Field 1 of %S is loaded again here, after the stores above, instead of
; reusing %0.
%.pre = load half*, half** %pState1, align 4
|
|
br label %if.end
|
|
|
|
; Phase 2 setup: %35 is the field-1 pointer (reloaded value when coming from
; %while.end, %0 when coming straight from %entry). The copy source starts at
; %35 + %blockSize elements and the copy loop runs %conv >> 2 times.
if.end: ; preds = %while.end, %entry
|
|
%35 = phi half* [ %.pre, %while.end ], [ %0, %entry ]
|
|
%arrayidx45 = getelementptr inbounds half, half* %35, i32 %blockSize
|
|
%shr47 = lshr i32 %conv, 2
|
|
%cmp49141 = icmp eq i32 %shr47, 0
|
|
br i1 %cmp49141, label %while.end55, label %while.body51.preheader
|
|
|
|
; %36 = %conv with its low two bits cleared (mask 65532); %scevgep is the
; source pointer value after the copy loop.
while.body51.preheader: ; preds = %if.end
|
|
%36 = and i32 %conv, 65532
|
|
%37 = add i32 %36, %blockSize
|
|
%scevgep = getelementptr half, half* %35, i32 %37
|
|
br label %while.body51
|
|
|
|
; Copy loop: load an <8 x half> vector from the source cursor, store it to the
; destination cursor, and advance both cursors by 4 halves.
while.body51: ; preds = %while.body51.preheader, %while.body51
|
|
%pTempSrc.1144 = phi half* [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ]
|
|
%pTempDest.0143 = phi half* [ %add.ptr53, %while.body51 ], [ %35, %while.body51.preheader ]
|
|
%blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ]
|
|
%38 = bitcast half* %pTempSrc.1144 to <8 x half>*
|
|
%39 = load <8 x half>, <8 x half>* %38, align 4
|
|
%40 = bitcast half* %pTempDest.0143 to <8 x half>*
|
|
store <8 x half> %39, <8 x half>* %40, align 4
|
|
%add.ptr52 = getelementptr inbounds half, half* %pTempSrc.1144, i32 4
|
|
%add.ptr53 = getelementptr inbounds half, half* %pTempDest.0143, i32 4
|
|
%dec54 = add nsw i32 %blkCnt.1142, -1
|
|
%cmp49 = icmp eq i32 %dec54, 0
|
|
br i1 %cmp49, label %while.end55.loopexit, label %while.body51
|
|
|
|
while.end55.loopexit: ; preds = %while.body51
|
|
%scevgep156 = getelementptr half, half* %35, i32 %36
|
|
br label %while.end55
|
|
|
|
; Copy tail: only taken when (%conv & 3) is non-zero.
while.end55: ; preds = %while.end55.loopexit, %if.end
|
|
%pTempDest.0.lcssa = phi half* [ %35, %if.end ], [ %scevgep156, %while.end55.loopexit ]
|
|
%pTempSrc.1.lcssa = phi half* [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ]
|
|
%and56 = and i32 %conv, 3
|
|
%cmp57 = icmp eq i32 %and56, 0
|
|
br i1 %cmp57, label %if.end61, label %if.then59
|
|
|
|
; One more vector copy whose store is masked by
; llvm.arm.mve.vctp16(%conv & 3); the load is unmasked.
if.then59: ; preds = %while.end55
|
|
%41 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and56)
|
|
%42 = bitcast half* %pTempSrc.1.lcssa to <8 x half>*
|
|
%43 = load <8 x half>, <8 x half>* %42, align 4
|
|
%44 = bitcast half* %pTempDest.0.lcssa to <8 x half>*
|
|
tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %43, <8 x half>* %44, i32 4, <8 x i1> %41)
|
|
br label %if.end61
|
|
|
|
if.end61: ; preds = %while.end55, %if.then59
|
|
ret void
|
|
}
|
|
|
|
|
|
define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %blockSize) {
|
|
; CHECK-LABEL: fir:
|
|
; CHECK: @ %bb.0: @ %entry
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
; CHECK-NEXT: .pad #28
|
|
; CHECK-NEXT: sub sp, #28
|
|
; CHECK-NEXT: cmp r3, #8
|
|
; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
|
|
; CHECK-NEXT: blo.w .LBB16_12
|
|
; CHECK-NEXT: @ %bb.1: @ %if.then
|
|
; CHECK-NEXT: movs r7, #0
|
|
; CHECK-NEXT: cmp.w r7, r3, lsr #2
|
|
; CHECK-NEXT: beq.w .LBB16_12
|
|
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
|
|
; CHECK-NEXT: ldrh r4, [r0]
|
|
; CHECK-NEXT: movs r1, #1
|
|
; CHECK-NEXT: ldrd r5, r12, [r0, #4]
|
|
; CHECK-NEXT: lsr.w r11, r3, #2
|
|
; CHECK-NEXT: sub.w r0, r4, #8
|
|
; CHECK-NEXT: rsbs r3, r4, #0
|
|
; CHECK-NEXT: add.w r7, r0, r0, lsr #29
|
|
; CHECK-NEXT: and r0, r0, #7
|
|
; CHECK-NEXT: asrs r6, r7, #3
|
|
; CHECK-NEXT: cmp r6, #1
|
|
; CHECK-NEXT: it gt
|
|
; CHECK-NEXT: asrgt r1, r7, #3
|
|
; CHECK-NEXT: add.w r7, r5, r4, lsl #1
|
|
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
|
|
; CHECK-NEXT: subs r1, r7, #2
|
|
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
|
|
; CHECK-NEXT: add.w r3, r12, #16
|
|
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
|
|
; CHECK-NEXT: adds r0, #1
|
|
; CHECK-NEXT: str r4, [sp, #16] @ 4-byte Spill
|
|
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
|
|
; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
|
|
; CHECK-NEXT: b .LBB16_4
|
|
; CHECK-NEXT: .LBB16_3: @ %while.end
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
|
|
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
|
|
; CHECK-NEXT: subs.w r11, r11, #1
|
|
; CHECK-NEXT: vstrb.8 q0, [r2], #8
|
|
; CHECK-NEXT: add.w r0, r7, r0, lsl #1
|
|
; CHECK-NEXT: add.w r5, r0, #8
|
|
; CHECK-NEXT: beq.w .LBB16_12
|
|
; CHECK-NEXT: .LBB16_4: @ %while.body
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
|
; CHECK-NEXT: @ Child Loop BB16_6 Depth 2
|
|
; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
|
|
; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
|
|
; CHECK-NEXT: ldrh.w lr, [r12, #14]
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
|
|
; CHECK-NEXT: ldrh.w r10, [r12, #12]
|
|
; CHECK-NEXT: ldrh.w r7, [r12, #10]
|
|
; CHECK-NEXT: ldrh.w r4, [r12, #8]
|
|
; CHECK-NEXT: ldrh.w r3, [r12, #6]
|
|
; CHECK-NEXT: ldrh.w r6, [r12, #4]
|
|
; CHECK-NEXT: ldrh.w r8, [r12, #2]
|
|
; CHECK-NEXT: ldrh.w r9, [r12]
|
|
; CHECK-NEXT: vstrb.8 q0, [r1], #8
|
|
; CHECK-NEXT: vldrw.u32 q0, [r5]
|
|
; CHECK-NEXT: str r0, [sp, #24] @ 4-byte Spill
|
|
; CHECK-NEXT: adds r0, r5, #2
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
|
; CHECK-NEXT: vmul.f16 q0, q0, r9
|
|
; CHECK-NEXT: adds r0, r5, #6
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r8
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r6
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
|
; CHECK-NEXT: add.w r0, r5, #10
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r3
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r4
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
|
; CHECK-NEXT: add.w r0, r5, #14
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r7
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
|
|
; CHECK-NEXT: add.w r7, r5, #16
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r10
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
|
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
|
|
; CHECK-NEXT: vfma.f16 q0, q1, lr
|
|
; CHECK-NEXT: cmp r0, #16
|
|
; CHECK-NEXT: blo .LBB16_7
|
|
; CHECK-NEXT: @ %bb.5: @ %for.body.preheader
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
|
|
; CHECK-NEXT: ldr.w lr, [sp] @ 4-byte Reload
|
|
; CHECK-NEXT: dls lr, lr
|
|
; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
|
|
; CHECK-NEXT: .LBB16_6: @ %for.body
|
|
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT: ldrh r0, [r6]
|
|
; CHECK-NEXT: vldrw.u32 q1, [r7]
|
|
; CHECK-NEXT: adds r3, r7, #2
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: vldrw.u32 q1, [r3]
|
|
; CHECK-NEXT: ldrh r0, [r6, #2]
|
|
; CHECK-NEXT: adds r3, r7, #6
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: ldrh r0, [r6, #4]
|
|
; CHECK-NEXT: vldrw.u32 q1, [r7, #4]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: vldrw.u32 q1, [r3]
|
|
; CHECK-NEXT: ldrh r0, [r6, #6]
|
|
; CHECK-NEXT: add.w r3, r7, #10
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: ldrh r0, [r6, #8]
|
|
; CHECK-NEXT: vldrw.u32 q1, [r7, #8]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: vldrw.u32 q1, [r3]
|
|
; CHECK-NEXT: ldrh r0, [r6, #10]
|
|
; CHECK-NEXT: ldrh r3, [r6, #14]
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: ldrh r0, [r6, #12]
|
|
; CHECK-NEXT: vldrw.u32 q1, [r7, #12]
|
|
; CHECK-NEXT: adds r6, #16
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
|
; CHECK-NEXT: add.w r0, r7, #14
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
|
; CHECK-NEXT: adds r7, #16
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r3
|
|
; CHECK-NEXT: le lr, .LBB16_6
|
|
; CHECK-NEXT: b .LBB16_8
|
|
; CHECK-NEXT: .LBB16_7: @ in Loop: Header=BB16_4 Depth=1
|
|
; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
|
|
; CHECK-NEXT: .LBB16_8: @ %for.end
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
|
|
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
|
|
; CHECK-NEXT: cmp r0, #0
|
|
; CHECK-NEXT: beq.w .LBB16_3
|
|
; CHECK-NEXT: @ %bb.9: @ %while.body76.preheader
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
|
|
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
|
|
; CHECK-NEXT: mov r5, r7
|
|
; CHECK-NEXT: .LBB16_10: @ %while.body76
|
|
; CHECK-NEXT: @ Parent Loop BB16_4 Depth=1
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
; CHECK-NEXT: ldrh r3, [r6], #2
|
|
; CHECK-NEXT: vldrh.u16 q1, [r5], #2
|
|
; CHECK-NEXT: subs r0, #1
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r3
|
|
; CHECK-NEXT: cmp r0, #1
|
|
; CHECK-NEXT: bgt .LBB16_10
|
|
; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_4 Depth=1
|
|
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
|
|
; CHECK-NEXT: add.w r7, r7, r0, lsl #1
|
|
; CHECK-NEXT: b .LBB16_3
|
|
; CHECK-NEXT: .LBB16_12: @ %if.end
|
|
; CHECK-NEXT: add sp, #28
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
|
entry:
|
|
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1
|
|
%0 = load half*, half** %pState1, align 4
|
|
%pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2
|
|
%1 = load half*, half** %pCoeffs2, align 4
|
|
%numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0
|
|
%2 = load i16, i16* %numTaps3, align 4
|
|
%conv = zext i16 %2 to i32
|
|
%cmp = icmp ugt i32 %blockSize, 7
|
|
br i1 %cmp, label %if.then, label %if.end
|
|
|
|
if.then: ; preds = %entry
|
|
%shr = lshr i32 %blockSize, 2
|
|
%cmp5217 = icmp eq i32 %shr, 0
|
|
br i1 %cmp5217, label %if.end, label %while.body.lr.ph
|
|
|
|
while.body.lr.ph: ; preds = %if.then
|
|
%sub = add nsw i32 %conv, -1
|
|
%arrayidx = getelementptr inbounds half, half* %0, i32 %sub
|
|
%incdec.ptr = getelementptr inbounds half, half* %1, i32 1
|
|
%incdec.ptr7 = getelementptr inbounds half, half* %1, i32 2
|
|
%incdec.ptr8 = getelementptr inbounds half, half* %1, i32 3
|
|
%incdec.ptr9 = getelementptr inbounds half, half* %1, i32 4
|
|
%incdec.ptr10 = getelementptr inbounds half, half* %1, i32 5
|
|
%incdec.ptr11 = getelementptr inbounds half, half* %1, i32 6
|
|
%incdec.ptr12 = getelementptr inbounds half, half* %1, i32 7
|
|
%sub37 = add nsw i32 %conv, -8
|
|
%div = sdiv i32 %sub37, 8
|
|
%pCoeffsCur.0199 = getelementptr inbounds half, half* %1, i32 8
|
|
%cmp38201 = icmp ugt i16 %2, 15
|
|
%and = and i32 %sub37, 7
|
|
%cmp74210 = icmp eq i32 %and, 0
|
|
%idx.neg = sub nsw i32 0, %conv
|
|
%3 = icmp sgt i32 %div, 1
|
|
%smax = select i1 %3, i32 %div, i32 1
|
|
br label %while.body
|
|
|
|
while.body: ; preds = %while.body.lr.ph, %while.end
|
|
%blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ]
|
|
%pStateCur.0221 = phi half* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ]
|
|
%pSamples.0220 = phi half* [ %0, %while.body.lr.ph ], [ %add.ptr83, %while.end ]
|
|
%pTempSrc.0219 = phi half* [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ]
|
|
%pOutput.0218 = phi half* [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ]
|
|
%4 = load half, half* %1, align 4
|
|
%5 = load half, half* %incdec.ptr, align 4
|
|
%6 = load half, half* %incdec.ptr7, align 4
|
|
%7 = load half, half* %incdec.ptr8, align 4
|
|
%8 = load half, half* %incdec.ptr9, align 4
|
|
%9 = load half, half* %incdec.ptr10, align 4
|
|
%10 = load half, half* %incdec.ptr11, align 4
|
|
%11 = load half, half* %incdec.ptr12, align 4
|
|
%12 = bitcast half* %pTempSrc.0219 to <8 x half>*
|
|
%13 = load <8 x half>, <8 x half>* %12, align 4
|
|
%14 = bitcast half* %pStateCur.0221 to <8 x half>*
|
|
store <8 x half> %13, <8 x half>* %14, align 4
|
|
%add.ptr = getelementptr inbounds half, half* %pStateCur.0221, i32 4
|
|
%add.ptr14 = getelementptr inbounds half, half* %pTempSrc.0219, i32 4
|
|
%15 = bitcast half* %pSamples.0220 to <8 x half>*
|
|
%16 = load <8 x half>, <8 x half>* %15, align 4
|
|
%.splatinsert = insertelement <8 x half> undef, half %4, i32 0
|
|
%.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%17 = fmul fast <8 x half> %16, %.splat
|
|
%arrayidx15 = getelementptr inbounds half, half* %pSamples.0220, i32 1
|
|
%18 = bitcast half* %arrayidx15 to <8 x half>*
|
|
%19 = load <8 x half>, <8 x half>* %18, align 4
|
|
%.splatinsert16 = insertelement <8 x half> undef, half %5, i32 0
|
|
%.splat17 = shufflevector <8 x half> %.splatinsert16, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%20 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %19, <8 x half> %.splat17, <8 x half> %17)
|
|
%arrayidx18 = getelementptr inbounds half, half* %pSamples.0220, i32 2
|
|
%21 = bitcast half* %arrayidx18 to <8 x half>*
|
|
%22 = load <8 x half>, <8 x half>* %21, align 4
|
|
%.splatinsert19 = insertelement <8 x half> undef, half %6, i32 0
|
|
%.splat20 = shufflevector <8 x half> %.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%23 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %22, <8 x half> %.splat20, <8 x half> %20)
|
|
%arrayidx21 = getelementptr inbounds half, half* %pSamples.0220, i32 3
|
|
%24 = bitcast half* %arrayidx21 to <8 x half>*
|
|
%25 = load <8 x half>, <8 x half>* %24, align 4
|
|
%.splatinsert22 = insertelement <8 x half> undef, half %7, i32 0
|
|
%.splat23 = shufflevector <8 x half> %.splatinsert22, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%26 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %25, <8 x half> %.splat23, <8 x half> %23)
|
|
%arrayidx24 = getelementptr inbounds half, half* %pSamples.0220, i32 4
|
|
%27 = bitcast half* %arrayidx24 to <8 x half>*
|
|
%28 = load <8 x half>, <8 x half>* %27, align 4
|
|
%.splatinsert25 = insertelement <8 x half> undef, half %8, i32 0
|
|
%.splat26 = shufflevector <8 x half> %.splatinsert25, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%29 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %28, <8 x half> %.splat26, <8 x half> %26)
|
|
%arrayidx27 = getelementptr inbounds half, half* %pSamples.0220, i32 5
|
|
%30 = bitcast half* %arrayidx27 to <8 x half>*
|
|
%31 = load <8 x half>, <8 x half>* %30, align 4
|
|
%.splatinsert28 = insertelement <8 x half> undef, half %9, i32 0
|
|
%.splat29 = shufflevector <8 x half> %.splatinsert28, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%32 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %31, <8 x half> %.splat29, <8 x half> %29)
|
|
%arrayidx30 = getelementptr inbounds half, half* %pSamples.0220, i32 6
|
|
%33 = bitcast half* %arrayidx30 to <8 x half>*
|
|
%34 = load <8 x half>, <8 x half>* %33, align 4
|
|
%.splatinsert31 = insertelement <8 x half> undef, half %10, i32 0
|
|
%.splat32 = shufflevector <8 x half> %.splatinsert31, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%35 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %34, <8 x half> %.splat32, <8 x half> %32)
|
|
%arrayidx33 = getelementptr inbounds half, half* %pSamples.0220, i32 7
|
|
%36 = bitcast half* %arrayidx33 to <8 x half>*
|
|
%37 = load <8 x half>, <8 x half>* %36, align 4
|
|
%.splatinsert34 = insertelement <8 x half> undef, half %11, i32 0
|
|
%.splat35 = shufflevector <8 x half> %.splatinsert34, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%38 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %37, <8 x half> %.splat35, <8 x half> %35)
|
|
%pSamples.1200 = getelementptr inbounds half, half* %pSamples.0220, i32 8
|
|
br i1 %cmp38201, label %for.body, label %for.end
|
|
|
|
for.body: ; preds = %while.body, %for.body
|
|
%pSamples.1207 = phi half* [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ]
|
|
%pCoeffsCur.0206 = phi half* [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ]
|
|
%.pn205 = phi half* [ %pCoeffsCur.0206, %for.body ], [ %1, %while.body ]
|
|
%i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ]
|
|
%vecAcc0.0203 = phi <8 x half> [ %70, %for.body ], [ %38, %while.body ]
|
|
%pSamples.0.pn202 = phi half* [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ]
|
|
%incdec.ptr40 = getelementptr inbounds half, half* %.pn205, i32 9
|
|
%39 = load half, half* %pCoeffsCur.0206, align 4
|
|
%incdec.ptr41 = getelementptr inbounds half, half* %.pn205, i32 10
|
|
%40 = load half, half* %incdec.ptr40, align 4
|
|
%incdec.ptr42 = getelementptr inbounds half, half* %.pn205, i32 11
|
|
%41 = load half, half* %incdec.ptr41, align 4
|
|
%incdec.ptr43 = getelementptr inbounds half, half* %.pn205, i32 12
|
|
%42 = load half, half* %incdec.ptr42, align 4
|
|
%incdec.ptr44 = getelementptr inbounds half, half* %.pn205, i32 13
|
|
%43 = load half, half* %incdec.ptr43, align 4
|
|
%incdec.ptr45 = getelementptr inbounds half, half* %.pn205, i32 14
|
|
%44 = load half, half* %incdec.ptr44, align 4
|
|
%incdec.ptr46 = getelementptr inbounds half, half* %.pn205, i32 15
|
|
%45 = load half, half* %incdec.ptr45, align 4
|
|
%46 = load half, half* %incdec.ptr46, align 4
|
|
%47 = bitcast half* %pSamples.1207 to <8 x half>*
|
|
%48 = load <8 x half>, <8 x half>* %47, align 4
|
|
%.splatinsert48 = insertelement <8 x half> undef, half %39, i32 0
|
|
%.splat49 = shufflevector <8 x half> %.splatinsert48, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%49 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %48, <8 x half> %.splat49, <8 x half> %vecAcc0.0203)
|
|
%arrayidx50 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 9
|
|
%50 = bitcast half* %arrayidx50 to <8 x half>*
|
|
%51 = load <8 x half>, <8 x half>* %50, align 4
|
|
%.splatinsert51 = insertelement <8 x half> undef, half %40, i32 0
|
|
%.splat52 = shufflevector <8 x half> %.splatinsert51, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%52 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %51, <8 x half> %.splat52, <8 x half> %49)
|
|
%arrayidx53 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 10
|
|
%53 = bitcast half* %arrayidx53 to <8 x half>*
|
|
%54 = load <8 x half>, <8 x half>* %53, align 4
|
|
%.splatinsert54 = insertelement <8 x half> undef, half %41, i32 0
|
|
%.splat55 = shufflevector <8 x half> %.splatinsert54, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%55 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %54, <8 x half> %.splat55, <8 x half> %52)
|
|
%arrayidx56 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 11
|
|
%56 = bitcast half* %arrayidx56 to <8 x half>*
|
|
%57 = load <8 x half>, <8 x half>* %56, align 4
|
|
%.splatinsert57 = insertelement <8 x half> undef, half %42, i32 0
|
|
%.splat58 = shufflevector <8 x half> %.splatinsert57, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%58 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %57, <8 x half> %.splat58, <8 x half> %55)
|
|
%arrayidx59 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 12
|
|
%59 = bitcast half* %arrayidx59 to <8 x half>*
|
|
%60 = load <8 x half>, <8 x half>* %59, align 4
|
|
%.splatinsert60 = insertelement <8 x half> undef, half %43, i32 0
|
|
%.splat61 = shufflevector <8 x half> %.splatinsert60, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%61 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %60, <8 x half> %.splat61, <8 x half> %58)
|
|
%arrayidx62 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 13
|
|
%62 = bitcast half* %arrayidx62 to <8 x half>*
|
|
%63 = load <8 x half>, <8 x half>* %62, align 4
|
|
%.splatinsert63 = insertelement <8 x half> undef, half %44, i32 0
|
|
%.splat64 = shufflevector <8 x half> %.splatinsert63, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%64 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %63, <8 x half> %.splat64, <8 x half> %61)
|
|
%arrayidx65 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 14
|
|
%65 = bitcast half* %arrayidx65 to <8 x half>*
|
|
%66 = load <8 x half>, <8 x half>* %65, align 4
|
|
%.splatinsert66 = insertelement <8 x half> undef, half %45, i32 0
|
|
%.splat67 = shufflevector <8 x half> %.splatinsert66, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%67 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %66, <8 x half> %.splat67, <8 x half> %64)
|
|
%arrayidx68 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 15
|
|
%68 = bitcast half* %arrayidx68 to <8 x half>*
|
|
%69 = load <8 x half>, <8 x half>* %68, align 4
|
|
%.splatinsert69 = insertelement <8 x half> undef, half %46, i32 0
|
|
%.splat70 = shufflevector <8 x half> %.splatinsert69, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%70 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %69, <8 x half> %.splat70, <8 x half> %67)
|
|
%inc = add nuw nsw i32 %i.0204, 1
|
|
%pCoeffsCur.0 = getelementptr inbounds half, half* %pCoeffsCur.0206, i32 8
|
|
%pSamples.1 = getelementptr inbounds half, half* %pSamples.1207, i32 8
|
|
%exitcond = icmp eq i32 %inc, %smax
|
|
br i1 %exitcond, label %for.end, label %for.body
|
|
|
|
for.end: ; preds = %for.body, %while.body
|
|
%vecAcc0.0.lcssa = phi <8 x half> [ %38, %while.body ], [ %70, %for.body ]
|
|
%pCoeffsCur.0.lcssa = phi half* [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ]
|
|
%pSamples.1.lcssa = phi half* [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ]
|
|
br i1 %cmp74210, label %while.end, label %while.body76
|
|
|
|
while.body76: ; preds = %for.end, %while.body76
|
|
%pCoeffsCur.1214 = phi half* [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ]
|
|
%vecAcc0.1213 = phi <8 x half> [ %74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ]
|
|
%numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ]
|
|
%pSamples.2211 = phi half* [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ]
|
|
%incdec.ptr77 = getelementptr inbounds half, half* %pCoeffsCur.1214, i32 1
|
|
%71 = load half, half* %pCoeffsCur.1214, align 4
|
|
%72 = bitcast half* %pSamples.2211 to <8 x half>*
|
|
%73 = load <8 x half>, <8 x half>* %72, align 4
|
|
%.splatinsert78 = insertelement <8 x half> undef, half %71, i32 0
|
|
%.splat79 = shufflevector <8 x half> %.splatinsert78, <8 x half> undef, <8 x i32> zeroinitializer
|
|
%74 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %73, <8 x half> %.splat79, <8 x half> %vecAcc0.1213)
|
|
%incdec.ptr80 = getelementptr inbounds half, half* %pSamples.2211, i32 1
|
|
%dec = add nsw i32 %numCnt.0212, -1
|
|
%cmp74 = icmp sgt i32 %numCnt.0212, 1
|
|
br i1 %cmp74, label %while.body76, label %while.end.loopexit
|
|
|
|
while.end.loopexit: ; preds = %while.body76
|
|
%scevgep = getelementptr half, half* %pSamples.1.lcssa, i32 %and
|
|
br label %while.end
|
|
|
|
while.end: ; preds = %while.end.loopexit, %for.end
|
|
%pSamples.2.lcssa = phi half* [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ]
|
|
%vecAcc0.1.lcssa = phi <8 x half> [ %vecAcc0.0.lcssa, %for.end ], [ %74, %while.end.loopexit ]
|
|
%75 = bitcast half* %pOutput.0218 to <8 x half>*
|
|
store <8 x half> %vecAcc0.1.lcssa, <8 x half>* %75, align 4
|
|
%add.ptr81 = getelementptr inbounds half, half* %pOutput.0218, i32 4
|
|
%add.ptr82 = getelementptr inbounds half, half* %pSamples.2.lcssa, i32 4
|
|
%add.ptr83 = getelementptr inbounds half, half* %add.ptr82, i32 %idx.neg
|
|
%dec84 = add nsw i32 %blkCnt.0222, -1
|
|
%cmp5 = icmp eq i32 %dec84, 0
|
|
br i1 %cmp5, label %if.end, label %while.body
|
|
|
|
if.end: ; preds = %while.end, %if.then, %entry
|
|
ret void
|
|
}
|
|
|
|
; Intrinsic declarations: each `declare` gives a signature only, with no body in this file.
;
; llvm.assume: takes one i1 and returns void. test_fadd at the top of this file
; calls it with the result of `(%n & 7) == 0`.
declare void @llvm.assume(i1)
|
|
; llvm.arm.mve.vctp16: takes an i32 and returns an <8 x i1> vector.
; NOTE(review): no call is visible in this part of the file; presumably used by
; an earlier test function -- confirm.
declare <8 x i1> @llvm.arm.mve.vctp16(i32)
|
|
; llvm.fma.v8f16: three <8 x half> operands, <8 x half> result. The loop bodies
; above call it as fma(loaded vector, splatted scalar, running accumulator).
declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
|
|
; llvm.masked.store.v8f16.p0v8f16: takes an <8 x half>, an <8 x half>* pointer,
; an i32 marked `immarg`, and an <8 x i1>; returns void.
; NOTE(review): no call is visible in this part of the file; presumably used by
; an earlier test function -- confirm.
declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>)
|