1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 03:02:36 +01:00
llvm-mirror/test/CodeGen/Thumb2/mve-float32regloops.ll

2239 lines
106 KiB
LLVM
Raw Normal View History

2020-03-20 09:25:19 +01:00
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s
2020-03-20 09:25:19 +01:00
define arm_aapcs_vfpcc void @test_fadd(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fadd:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB0_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: .LBB0_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.f32 q0, q0, r3
; CHECK-NEXT: vstrb.8 q0, [r1], #16
; CHECK-NEXT: bne .LBB0_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp18 = icmp sgt i32 %n, 0
br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
%broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = fadd fast <4 x float> %wide.load, %broadcast.splat11
%4 = getelementptr inbounds float, float* %C, i32 %index
%5 = bitcast float* %4 to <4 x float>*
store <4 x float> %3, <4 x float>* %5, align 4
%index.next = add i32 %index, 4
%6 = icmp eq i32 %index.next, %n
br i1 %6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define arm_aapcs_vfpcc void @test_fadd_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fadd_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB1_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: .LBB1_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vadd.f32 q0, q0, r3
; CHECK-NEXT: vstrb.8 q0, [r1], #16
; CHECK-NEXT: bne .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp18 = icmp sgt i32 %n, 0
br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
%broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = fadd fast <4 x float> %broadcast.splat11, %wide.load
%4 = getelementptr inbounds float, float* %C, i32 %index
%5 = bitcast float* %4 to <4 x float>*
store <4 x float> %3, <4 x float>* %5, align 4
%index.next = add i32 %index, 4
%6 = icmp eq i32 %index.next, %n
br i1 %6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define arm_aapcs_vfpcc void @test_fmul(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fmul:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB2_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: .LBB2_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmul.f32 q0, q0, r3
; CHECK-NEXT: vstrb.8 q0, [r1], #16
; CHECK-NEXT: bne .LBB2_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp18 = icmp sgt i32 %n, 0
br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
%broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = fmul fast <4 x float> %wide.load, %broadcast.splat11
%4 = getelementptr inbounds float, float* %C, i32 %index
%5 = bitcast float* %4 to <4 x float>*
store <4 x float> %3, <4 x float>* %5, align 4
%index.next = add i32 %index, 4
%6 = icmp eq i32 %index.next, %n
br i1 %6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define arm_aapcs_vfpcc void @test_fmul_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fmul_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB3_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: .LBB3_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vmul.f32 q0, q0, r3
; CHECK-NEXT: vstrb.8 q0, [r1], #16
; CHECK-NEXT: bne .LBB3_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp18 = icmp sgt i32 %n, 0
br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
%broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = fmul fast <4 x float> %broadcast.splat11, %wide.load
%4 = getelementptr inbounds float, float* %C, i32 %index
%5 = bitcast float* %4 to <4 x float>*
store <4 x float> %3, <4 x float>* %5, align 4
%index.next = add i32 %index, 4
%6 = icmp eq i32 %index.next, %n
br i1 %6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define arm_aapcs_vfpcc void @test_fsub(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fsub:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB4_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: .LBB4_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vsub.f32 q0, q0, r3
; CHECK-NEXT: vstrb.8 q0, [r1], #16
; CHECK-NEXT: bne .LBB4_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp18 = icmp sgt i32 %n, 0
br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
%broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = fsub fast <4 x float> %wide.load, %broadcast.splat11
%4 = getelementptr inbounds float, float* %C, i32 %index
%5 = bitcast float* %4 to <4 x float>*
store <4 x float> %3, <4 x float>* %5, align 4
%index.next = add i32 %index, 4
%6 = icmp eq i32 %index.next, %n
br i1 %6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define arm_aapcs_vfpcc void @test_fsub_r(float* noalias nocapture readonly %A, float %B, float* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fsub_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB5_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vdup.32 q0, r3
; CHECK-NEXT: .LBB5_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: subs r2, #4
; CHECK-NEXT: vsub.f32 q1, q0, q1
; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: bne .LBB5_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp18 = icmp sgt i32 %n, 0
br i1 %cmp18, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert10 = insertelement <4 x float> undef, float %B, i32 0
%broadcast.splat11 = shufflevector <4 x float> %broadcast.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = fsub fast <4 x float> %broadcast.splat11, %wide.load
%4 = getelementptr inbounds float, float* %C, i32 %index
%5 = bitcast float* %4 to <4 x float>*
store <4 x float> %3, <4 x float>* %5, align 4
%index.next = add i32 %index, 4
%6 = icmp eq i32 %index.next, %n
br i1 %6, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define arm_aapcs_vfpcc void @test_fmas(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmas:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB6_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: .LBB6_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vfmas.f32 q1, q0, r12
; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: bne .LBB6_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp110 = icmp sgt i32 %n, 0
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = getelementptr inbounds float, float* %B, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.load12 = load <4 x float>, <4 x float>* %4, align 4
%5 = fmul fast <4 x float> %wide.load12, %wide.load
%6 = fadd fast <4 x float> %5, %broadcast.splat14
%7 = getelementptr inbounds float, float* %D, i32 %index
%8 = bitcast float* %7 to <4 x float>*
store <4 x float> %6, <4 x float>* %8, align 4
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define arm_aapcs_vfpcc void @test_fmas_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmas_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB7_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: .LBB7_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vfmas.f32 q1, q0, r12
; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: bne .LBB7_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp110 = icmp sgt i32 %n, 0
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = getelementptr inbounds float, float* %B, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.load12 = load <4 x float>, <4 x float>* %4, align 4
%5 = fmul fast <4 x float> %wide.load12, %wide.load
%6 = fadd fast <4 x float> %broadcast.splat14, %5
%7 = getelementptr inbounds float, float* %D, i32 %index
%8 = bitcast float* %7 to <4 x float>*
store <4 x float> %6, <4 x float>* %8, align 4
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define arm_aapcs_vfpcc void @test_fma(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fma:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB8_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: .LBB8_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: bne .LBB8_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp110 = icmp sgt i32 %n, 0
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
%broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = fmul fast <4 x float> %wide.load, %broadcast.splat13
%4 = getelementptr inbounds float, float* %B, i32 %index
%5 = bitcast float* %4 to <4 x float>*
%wide.load14 = load <4 x float>, <4 x float>* %5, align 4
%6 = fadd fast <4 x float> %3, %wide.load14
%7 = getelementptr inbounds float, float* %D, i32 %index
%8 = bitcast float* %7 to <4 x float>*
store <4 x float> %6, <4 x float>* %8, align 4
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define arm_aapcs_vfpcc void @test_fma_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fma_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB9_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: .LBB9_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vfma.f32 q1, q0, r12
; CHECK-NEXT: vstrb.8 q1, [r2], #16
; CHECK-NEXT: bne .LBB9_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp110 = icmp sgt i32 %n, 0
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
%broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = fmul fast <4 x float> %broadcast.splat13, %wide.load
%4 = getelementptr inbounds float, float* %B, i32 %index
%5 = bitcast float* %4 to <4 x float>*
%wide.load14 = load <4 x float>, <4 x float>* %5, align 4
%6 = fadd fast <4 x float> %3, %wide.load14
%7 = getelementptr inbounds float, float* %D, i32 %index
%8 = bitcast float* %7 to <4 x float>*
store <4 x float> %6, <4 x float>* %8, align 4
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define arm_aapcs_vfpcc void @test_fmss(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmss:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB10_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: vdup.32 q0, r12
; CHECK-NEXT: vneg.f32 q0, q0
; CHECK-NEXT: .LBB10_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vfma.f32 q3, q2, q1
; CHECK-NEXT: vstrb.8 q3, [r2], #16
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp110 = icmp sgt i32 %n, 0
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = getelementptr inbounds float, float* %B, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.load12 = load <4 x float>, <4 x float>* %4, align 4
%5 = fmul fast <4 x float> %wide.load12, %wide.load
%6 = fsub fast <4 x float> %5, %broadcast.splat14
%7 = getelementptr inbounds float, float* %D, i32 %index
%8 = bitcast float* %7 to <4 x float>*
store <4 x float> %6, <4 x float>* %8, align 4
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define arm_aapcs_vfpcc void @test_fmss_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmss_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB11_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: vdup.32 q0, r12
; CHECK-NEXT: .LBB11_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
; CHECK-NEXT: vldrw.u32 q2, [r1], #16
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vfms.f32 q3, q2, q1
; CHECK-NEXT: vstrb.8 q3, [r2], #16
; CHECK-NEXT: bne .LBB11_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp110 = icmp sgt i32 %n, 0
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert13 = insertelement <4 x float> undef, float %C, i32 0
%broadcast.splat14 = shufflevector <4 x float> %broadcast.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = getelementptr inbounds float, float* %B, i32 %index
%4 = bitcast float* %3 to <4 x float>*
%wide.load12 = load <4 x float>, <4 x float>* %4, align 4
%5 = fmul fast <4 x float> %wide.load12, %wide.load
%6 = fsub fast <4 x float> %broadcast.splat14, %5
%7 = getelementptr inbounds float, float* %D, i32 %index
%8 = bitcast float* %7 to <4 x float>*
store <4 x float> %6, <4 x float>* %8, align 4
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define arm_aapcs_vfpcc void @test_fms(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fms:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB12_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: .LBB12_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vneg.f32 q0, q0
; CHECK-NEXT: vfma.f32 q0, q1, r12
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: bne .LBB12_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp110 = icmp sgt i32 %n, 0
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
%broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = fmul fast <4 x float> %wide.load, %broadcast.splat13
%4 = getelementptr inbounds float, float* %B, i32 %index
%5 = bitcast float* %4 to <4 x float>*
%wide.load14 = load <4 x float>, <4 x float>* %5, align 4
%6 = fsub fast <4 x float> %3, %wide.load14
%7 = getelementptr inbounds float, float* %D, i32 %index
%8 = bitcast float* %7 to <4 x float>*
store <4 x float> %6, <4 x float>* %8, align 4
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define arm_aapcs_vfpcc void @test_fms_r(float* noalias nocapture readonly %A, float* noalias nocapture readonly %B, float %C, float* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fms_r:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: cmp r3, #1
; CHECK-NEXT: it lt
; CHECK-NEXT: bxlt lr
; CHECK-NEXT: .LBB13_1: @ %vector.ph
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vmov r12, s0
; CHECK-NEXT: .LBB13_2: @ %vector.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vldrw.u32 q1, [r0], #16
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: subs r3, #4
; CHECK-NEXT: vneg.f32 q0, q0
; CHECK-NEXT: vfma.f32 q0, q1, r12
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: bne .LBB13_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bx lr
entry:
%0 = and i32 %n, 7
%cmp = icmp eq i32 %0, 0
tail call void @llvm.assume(i1 %cmp)
%cmp110 = icmp sgt i32 %n, 0
br i1 %cmp110, label %vector.ph, label %for.cond.cleanup
vector.ph: ; preds = %entry
%broadcast.splatinsert12 = insertelement <4 x float> undef, float %C, i32 0
%broadcast.splat13 = shufflevector <4 x float> %broadcast.splatinsert12, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %vector.ph
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
%1 = getelementptr inbounds float, float* %A, i32 %index
%2 = bitcast float* %1 to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %2, align 4
%3 = fmul fast <4 x float> %broadcast.splat13, %wide.load
%4 = getelementptr inbounds float, float* %B, i32 %index
%5 = bitcast float* %4 to <4 x float>*
%wide.load14 = load <4 x float>, <4 x float>* %5, align 4
%6 = fsub fast <4 x float> %3, %wide.load14
%7 = getelementptr inbounds float, float* %D, i32 %index
%8 = bitcast float* %7 to <4 x float>*
store <4 x float> %6, <4 x float>* %8, align 4
%index.next = add i32 %index, 4
%9 = icmp eq i32 %index.next, %n
br i1 %9, label %for.cond.cleanup, label %vector.body
for.cond.cleanup: ; preds = %vector.body, %entry
ret void
}
define dso_local void @test_nested(float* noalias nocapture %pInT1, float* noalias nocapture readonly %pOutT1, float* noalias nocapture readonly %pPRT_in, float* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l) local_unnamed_addr #0 {
2020-03-20 09:25:19 +01:00
; CHECK-LABEL: test_nested:
; CHECK: @ %bb.0: @ %for.body.us.preheader
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: ldrd lr, r12, [sp, #16]
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: lsl.w r3, r12, #2
; CHECK-NEXT: .LBB14_1: @ %for.body.us
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB14_2 Depth 2
; CHECK-NEXT: ldr r4, [r1]
; CHECK-NEXT: mov r5, r2
; CHECK-NEXT: mov r6, r12
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: mov r4, r0
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: .LBB14_2: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB14_1 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q1, [r5], #16
; CHECK-NEXT: vldrw.u32 q2, [r4]
; CHECK-NEXT: subs r6, #4
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vfms.f32 q2, q1, q0
; CHECK-NEXT: vstrb.8 q2, [r4], #16
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bne .LBB14_2
; CHECK-NEXT: @ %bb.3: @ %for.cond6.for.end_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB14_1 Depth=1
; CHECK-NEXT: add r0, r3
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: adds r1, #4
; CHECK-NEXT: le lr, .LBB14_1
; CHECK-NEXT: @ %bb.4: @ %for.end14
; CHECK-NEXT: pop {r4, r5, r6, pc}
2020-03-20 09:25:19 +01:00
for.body.us.preheader:
%cmp = icmp sgt i32 %numRows, 0
tail call void @llvm.assume(i1 %cmp)
%cmp1 = icmp sgt i32 %numCols, 0
tail call void @llvm.assume(i1 %cmp1)
%rem = and i32 %numCols, 7
%cmp2 = icmp eq i32 %rem, 0
tail call void @llvm.assume(i1 %cmp2)
%cmp3 = icmp slt i32 %l, %numCols
tail call void @llvm.assume(i1 %cmp3)
br label %for.body.us
for.body.us: ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader
%pInT1.addr.038.us = phi float* [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ]
%i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ]
%pOutT1.addr.036.us = phi float* [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ]
%pPRT_in.addr.035.us = phi float* [ %scevgep, %for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ]
%scevgep = getelementptr float, float* %pPRT_in.addr.035.us, i32 %numCols
%0 = load float, float* %pOutT1.addr.036.us, align 4
%broadcast.splatinsert47 = insertelement <4 x float> undef, float %0, i32 0
%broadcast.splat48 = shufflevector <4 x float> %broadcast.splatinsert47, <4 x float> undef, <4 x i32> zeroinitializer
br label %vector.body
vector.body: ; preds = %vector.body, %for.body.us
%index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ]
%next.gep = getelementptr float, float* %pInT1.addr.038.us, i32 %index
%next.gep45 = getelementptr float, float* %pPRT_in.addr.035.us, i32 %index
%1 = bitcast float* %next.gep to <4 x float>*
%wide.load = load <4 x float>, <4 x float>* %1, align 4
%2 = bitcast float* %next.gep45 to <4 x float>*
%wide.load46 = load <4 x float>, <4 x float>* %2, align 4
%3 = fmul fast <4 x float> %wide.load46, %broadcast.splat48
%4 = fsub fast <4 x float> %wide.load, %3
store <4 x float> %4, <4 x float>* %1, align 4
%index.next = add i32 %index, 4
%5 = icmp eq i32 %index.next, %numCols
br i1 %5, label %for.cond6.for.end_crit_edge.us, label %vector.body
for.cond6.for.end_crit_edge.us: ; preds = %vector.body
%incdec.ptr.us = getelementptr inbounds float, float* %pOutT1.addr.036.us, i32 1
%scevgep40 = getelementptr float, float* %pInT1.addr.038.us, i32 %numCols
%inc13.us = add nuw nsw i32 %i.037.us, 1
%exitcond41 = icmp eq i32 %inc13.us, %numRows
br i1 %exitcond41, label %for.end14, label %for.body.us
for.end14: ; preds = %for.cond6.for.end_crit_edge.us
ret void
}
%struct.arm_fir_instance_f32 = type { i16, float*, float* }
define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* %pDst, i32 %blockSize) {
; CHECK-LABEL: arm_fir_f32_1_4_mve:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, #8
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: ldrh.w r10, [r0]
; CHECK-NEXT: mov r11, r1
; CHECK-NEXT: ldr.w r12, [r0, #4]
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: sub.w r1, r10, #1
; CHECK-NEXT: cmp r1, #3
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: bhi .LBB15_6
; CHECK-NEXT: @ %bb.1: @ %if.then
; CHECK-NEXT: ldr r4, [r0, #8]
; CHECK-NEXT: ldrd r7, r6, [r4]
; CHECK-NEXT: ldrd r5, r8, [r4, #8]
; CHECK-NEXT: add.w r4, r12, r1, lsl #2
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: lsrs r1, r3, #2
; CHECK-NEXT: wls lr, r1, .LBB15_5
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
; CHECK-NEXT: bic r1, r3, #3
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: add.w r9, r12, #4
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: mov r1, r11
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: .LBB15_3: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: vstrb.8 q0, [r4], #16
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q0, [r9, #-4]
; CHECK-NEXT: vldrw.u32 q1, [r9], #16
; CHECK-NEXT: vmul.f32 q0, q0, r7
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q2, [r9, #-8]
; CHECK-NEXT: vfma.f32 q0, q1, r6
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q1, [r9, #-12]
; CHECK-NEXT: vfma.f32 q0, q1, r5
; CHECK-NEXT: vfma.f32 q0, q2, r8
; CHECK-NEXT: vstrb.8 q0, [r2], #16
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: le lr, .LBB15_3
; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: add.w r12, r12, r1, lsl #2
; CHECK-NEXT: add.w r11, r11, r1, lsl #2
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: .LBB15_5: @ %while.end
; CHECK-NEXT: and r1, r3, #3
; CHECK-NEXT: vldrw.u32 q0, [r11]
; CHECK-NEXT: vctp.32 r1
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r4]
; CHECK-NEXT: vldrw.u32 q0, [r12]
; CHECK-NEXT: vldrw.u32 q1, [r12, #4]
; CHECK-NEXT: vmul.f32 q0, q0, r7
; CHECK-NEXT: vfma.f32 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r12, #8]
; CHECK-NEXT: vfma.f32 q0, q1, r5
; CHECK-NEXT: vldrw.u32 q1, [r12, #12]
; CHECK-NEXT: vfma.f32 q0, q1, r8
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r2]
; CHECK-NEXT: ldr.w r12, [r0, #4]
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: .LBB15_6: @ %if.end
; CHECK-NEXT: add.w r0, r12, r3, lsl #2
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: lsr.w r1, r10, #2
; CHECK-NEXT: wls lr, r1, .LBB15_10
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: bic r2, r10, #3
; CHECK-NEXT: adds r1, r2, r3
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: add.w r1, r12, r1, lsl #2
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: .LBB15_8: @ %while.body51
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r0], #16
; CHECK-NEXT: vstrb.8 q0, [r3], #16
; CHECK-NEXT: le lr, .LBB15_8
; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit
; CHECK-NEXT: add.w r12, r12, r2, lsl #2
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: mov r0, r1
; CHECK-NEXT: .LBB15_10: @ %while.end55
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: ands r1, r10, #3
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: beq .LBB15_12
; CHECK-NEXT: @ %bb.11: @ %if.then59
; CHECK-NEXT: vldrw.u32 q0, [r0]
; CHECK-NEXT: vctp.32 r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrwt.32 q0, [r12]
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: .LBB15_12: @ %if.end61
; CHECK-NEXT: add sp, #8
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
2020-03-20 09:25:19 +01:00
entry:
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1
%0 = load float*, float** %pState1, align 4
%pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2
%1 = load float*, float** %pCoeffs2, align 4
%numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0
%2 = load i16, i16* %numTaps3, align 4
%conv = zext i16 %2 to i32
%sub = add nsw i32 %conv, -1
%cmp = icmp ult i32 %sub, 4
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%arrayidx = getelementptr inbounds float, float* %0, i32 %sub
%incdec.ptr = getelementptr inbounds float, float* %1, i32 1
%3 = load float, float* %1, align 4
%incdec.ptr6 = getelementptr inbounds float, float* %1, i32 2
%4 = load float, float* %incdec.ptr, align 4
%incdec.ptr7 = getelementptr inbounds float, float* %1, i32 3
%5 = load float, float* %incdec.ptr6, align 4
%6 = load float, float* %incdec.ptr7, align 4
%shr = lshr i32 %blockSize, 2
%cmp9146 = icmp eq i32 %shr, 0
%.pre161 = insertelement <4 x float> undef, float %3, i32 0
%.pre162 = shufflevector <4 x float> %.pre161, <4 x float> undef, <4 x i32> zeroinitializer
%.pre163 = insertelement <4 x float> undef, float %4, i32 0
%.pre164 = shufflevector <4 x float> %.pre163, <4 x float> undef, <4 x i32> zeroinitializer
%.pre165 = insertelement <4 x float> undef, float %5, i32 0
%.pre166 = shufflevector <4 x float> %.pre165, <4 x float> undef, <4 x i32> zeroinitializer
%.pre167 = insertelement <4 x float> undef, float %6, i32 0
%.pre168 = shufflevector <4 x float> %.pre167, <4 x float> undef, <4 x i32> zeroinitializer
br i1 %cmp9146, label %while.end, label %while.body.lr.ph
while.body.lr.ph: ; preds = %if.then
%7 = and i32 %blockSize, -4
%scevgep158 = getelementptr float, float* %pDst, i32 %7
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%pStateCur.0151 = phi float* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%pSamples.0150 = phi float* [ %0, %while.body.lr.ph ], [ %add.ptr24, %while.body ]
%pOutput.0149 = phi float* [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ]
%pTempSrc.0148 = phi float* [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ]
%blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
%8 = bitcast float* %pTempSrc.0148 to <4 x float>*
%9 = load <4 x float>, <4 x float>* %8, align 4
%10 = bitcast float* %pStateCur.0151 to <4 x float>*
store <4 x float> %9, <4 x float>* %10, align 4
%add.ptr = getelementptr inbounds float, float* %pStateCur.0151, i32 4
%add.ptr11 = getelementptr inbounds float, float* %pTempSrc.0148, i32 4
%11 = bitcast float* %pSamples.0150 to <4 x float>*
%12 = load <4 x float>, <4 x float>* %11, align 4
%13 = fmul fast <4 x float> %12, %.pre162
%arrayidx12 = getelementptr inbounds float, float* %pSamples.0150, i32 1
%14 = bitcast float* %arrayidx12 to <4 x float>*
%15 = load <4 x float>, <4 x float>* %14, align 4
%mul = fmul fast <4 x float> %15, %.pre164
%add = fadd fast <4 x float> %mul, %13
%arrayidx13 = getelementptr inbounds float, float* %pSamples.0150, i32 2
%16 = bitcast float* %arrayidx13 to <4 x float>*
%17 = load <4 x float>, <4 x float>* %16, align 4
%mul16 = fmul fast <4 x float> %17, %.pre166
%add17 = fadd fast <4 x float> %add, %mul16
%arrayidx18 = getelementptr inbounds float, float* %pSamples.0150, i32 3
%18 = bitcast float* %arrayidx18 to <4 x float>*
%19 = load <4 x float>, <4 x float>* %18, align 4
%mul21 = fmul fast <4 x float> %19, %.pre168
%add22 = fadd fast <4 x float> %add17, %mul21
%20 = bitcast float* %pOutput.0149 to <4 x float>*
store <4 x float> %add22, <4 x float>* %20, align 4
%add.ptr23 = getelementptr inbounds float, float* %pOutput.0149, i32 4
%add.ptr24 = getelementptr inbounds float, float* %pSamples.0150, i32 4
%dec = add nsw i32 %blkCnt.0147, -1
%cmp9 = icmp eq i32 %dec, 0
br i1 %cmp9, label %while.end.loopexit, label %while.body
while.end.loopexit: ; preds = %while.body
%scevgep157 = getelementptr float, float* %pSrc, i32 %7
%scevgep159 = getelementptr float, float* %0, i32 %7
br label %while.end
while.end: ; preds = %if.then, %while.end.loopexit
%pTempSrc.0.lcssa = phi float* [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ]
%pOutput.0.lcssa = phi float* [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ]
%pSamples.0.lcssa = phi float* [ %scevgep159, %while.end.loopexit ], [ %0, %if.then ]
%pStateCur.0.lcssa = phi float* [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ]
%and = and i32 %blockSize, 3
%21 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and)
%22 = bitcast float* %pTempSrc.0.lcssa to <4 x float>*
%23 = load <4 x float>, <4 x float>* %22, align 4
%24 = bitcast float* %pStateCur.0.lcssa to <4 x float>*
tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %23, <4 x float>* %24, i32 4, <4 x i1> %21)
%25 = bitcast float* %pSamples.0.lcssa to <4 x float>*
%26 = load <4 x float>, <4 x float>* %25, align 4
%27 = fmul fast <4 x float> %26, %.pre162
%arrayidx29 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 1
%28 = bitcast float* %arrayidx29 to <4 x float>*
%29 = load <4 x float>, <4 x float>* %28, align 4
%mul32 = fmul fast <4 x float> %29, %.pre164
%add33 = fadd fast <4 x float> %mul32, %27
%arrayidx34 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 2
%30 = bitcast float* %arrayidx34 to <4 x float>*
%31 = load <4 x float>, <4 x float>* %30, align 4
%mul37 = fmul fast <4 x float> %31, %.pre166
%add38 = fadd fast <4 x float> %add33, %mul37
%arrayidx39 = getelementptr inbounds float, float* %pSamples.0.lcssa, i32 3
%32 = bitcast float* %arrayidx39 to <4 x float>*
%33 = load <4 x float>, <4 x float>* %32, align 4
%mul42 = fmul fast <4 x float> %33, %.pre168
%add43 = fadd fast <4 x float> %add38, %mul42
%34 = bitcast float* %pOutput.0.lcssa to <4 x float>*
tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %add43, <4 x float>* %34, i32 4, <4 x i1> %21)
%.pre = load float*, float** %pState1, align 4
br label %if.end
if.end: ; preds = %while.end, %entry
%35 = phi float* [ %.pre, %while.end ], [ %0, %entry ]
%arrayidx45 = getelementptr inbounds float, float* %35, i32 %blockSize
%shr47 = lshr i32 %conv, 2
%cmp49141 = icmp eq i32 %shr47, 0
br i1 %cmp49141, label %while.end55, label %while.body51.preheader
while.body51.preheader: ; preds = %if.end
%36 = and i32 %conv, 65532
%37 = add i32 %36, %blockSize
%scevgep = getelementptr float, float* %35, i32 %37
br label %while.body51
while.body51: ; preds = %while.body51.preheader, %while.body51
%pTempSrc.1144 = phi float* [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ]
%pTempDest.0143 = phi float* [ %add.ptr53, %while.body51 ], [ %35, %while.body51.preheader ]
%blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ]
%38 = bitcast float* %pTempSrc.1144 to <4 x float>*
%39 = load <4 x float>, <4 x float>* %38, align 4
%40 = bitcast float* %pTempDest.0143 to <4 x float>*
store <4 x float> %39, <4 x float>* %40, align 4
%add.ptr52 = getelementptr inbounds float, float* %pTempSrc.1144, i32 4
%add.ptr53 = getelementptr inbounds float, float* %pTempDest.0143, i32 4
%dec54 = add nsw i32 %blkCnt.1142, -1
%cmp49 = icmp eq i32 %dec54, 0
br i1 %cmp49, label %while.end55.loopexit, label %while.body51
while.end55.loopexit: ; preds = %while.body51
%scevgep156 = getelementptr float, float* %35, i32 %36
br label %while.end55
while.end55: ; preds = %while.end55.loopexit, %if.end
%pTempDest.0.lcssa = phi float* [ %35, %if.end ], [ %scevgep156, %while.end55.loopexit ]
%pTempSrc.1.lcssa = phi float* [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ]
%and56 = and i32 %conv, 3
%cmp57 = icmp eq i32 %and56, 0
br i1 %cmp57, label %if.end61, label %if.then59
if.then59: ; preds = %while.end55
%41 = tail call <4 x i1> @llvm.arm.mve.vctp32(i32 %and56)
%42 = bitcast float* %pTempSrc.1.lcssa to <4 x float>*
%43 = load <4 x float>, <4 x float>* %42, align 4
%44 = bitcast float* %pTempDest.0.lcssa to <4 x float>*
tail call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %43, <4 x float>* %44, i32 4, <4 x i1> %41)
br label %if.end61
if.end61: ; preds = %while.end55, %if.then59
ret void
}
define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %blockSize) {
; CHECK-LABEL: fir:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: cmp r3, #8
; CHECK-NEXT: blo.w .LBB16_12
; CHECK-NEXT: @ %bb.1: @ %entry
; CHECK-NEXT: lsrs.w r12, r3, #2
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: beq.w .LBB16_12
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
; CHECK-NEXT: ldrh r6, [r0]
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: ldrd r4, r10, [r0, #4]
; CHECK-NEXT: sub.w r0, r6, #8
; CHECK-NEXT: add.w r3, r0, r0, lsr #29
; CHECK-NEXT: and r0, r0, #7
; CHECK-NEXT: asrs r7, r3, #3
; CHECK-NEXT: cmp r7, #1
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: it gt
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: asrgt r5, r3, #3
; CHECK-NEXT: add.w r3, r4, r6, lsl #2
; CHECK-NEXT: sub.w r9, r3, #4
; CHECK-NEXT: rsbs r3, r6, #0
; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: add.w r3, r10, #32
; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_5
; CHECK-NEXT: .LBB16_3: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
; CHECK-NEXT: wls lr, r0, .LBB16_4
; CHECK-NEXT: b .LBB16_9
; CHECK-NEXT: .LBB16_4: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: add.w r0, r4, r0, lsl #2
; CHECK-NEXT: add.w r4, r0, #16
; CHECK-NEXT: beq .LBB16_12
; CHECK-NEXT: .LBB16_5: @ %while.body
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_7 Depth 2
; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
; CHECK-NEXT: add.w lr, r10, #8
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
; CHECK-NEXT: ldrd r3, r7, [r10]
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr}
; CHECK-NEXT: ldrd r11, r8, [r10, #24]
; CHECK-NEXT: vstrb.8 q0, [r9], #16
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q0, [r4], #32
; CHECK-NEXT: strd r9, r1, [sp, #24] @ 8-byte Folded Spill
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
; CHECK-NEXT: vmul.f32 q0, q0, r3
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
; CHECK-NEXT: vfma.f32 q0, q1, r7
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
; CHECK-NEXT: vfma.f32 q0, q6, r0
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
; CHECK-NEXT: vfma.f32 q0, q4, r5
; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
; CHECK-NEXT: vfma.f32 q0, q5, r6
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vfma.f32 q0, q2, lr
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
; CHECK-NEXT: vfma.f32 q0, q3, r11
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: vfma.f32 q0, q1, r8
; CHECK-NEXT: blo .LBB16_8
; CHECK-NEXT: @ %bb.6: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
[ARM] Alter t2DoLoopStart to define lr This changes the definition of t2DoLoopStart from t2DoLoopStart rGPR to GPRlr = t2DoLoopStart rGPR This will hopefully mean that low overhead loops are more tied together, and we can more reliably generate loops without reverting or being at the whims of the register allocator. This is a fairly simple change in itself, but leads to a number of other required alterations. - The hardware loop pass, if UsePhi is set, now generates loops of the form: %start = llvm.start.loop.iterations(%N) loop: %p = phi [%start], [%dec] %dec = llvm.loop.decrement.reg(%p, 1) %c = icmp ne %dec, 0 br %c, loop, exit - For this a new llvm.start.loop.iterations intrinsic was added, identical to llvm.set.loop.iterations but produces a value as seen above, gluing the loop together more through def-use chains. - This new instrinsic conceptually produces the same output as input, which is taught to SCEV so that the checks in MVETailPredication are not affected. - Some minor changes are needed to the ARMLowOverheadLoop pass, but it has been left mostly as before. We should now more reliably be able to tell that the t2DoLoopStart is correct without having to prove it, but t2WhileLoopStart and tail-predicated loops will remain the same. - And all the tests have been updated. There are a lot of them! This patch on it's own might cause more trouble that it helps, with more tail-predicated loops being reverted, but some additional patches can hopefully improve upon that to get to something that is better overall. Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 16:57:58 +01:00
; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: dls lr, r0
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: .LBB16_7: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11}
; CHECK-NEXT: vldrw.u32 q1, [r4], #32
; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: vfma.f32 q0, q1, r0
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
; CHECK-NEXT: vfma.f32 q0, q1, r3
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: ldrd r9, r1, [r7, #24]
; CHECK-NEXT: vfma.f32 q0, q6, r5
; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
; CHECK-NEXT: vfma.f32 q0, q4, r6
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
; CHECK-NEXT: vfma.f32 q0, q5, r8
; CHECK-NEXT: adds r7, #32
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vfma.f32 q0, q2, r11
; CHECK-NEXT: vfma.f32 q0, q3, r9
; CHECK-NEXT: vfma.f32 q0, q1, r1
; CHECK-NEXT: le lr, .LBB16_7
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1
; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: mov r3, r4
; CHECK-NEXT: .LBB16_10: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldr r0, [r7], #4
; CHECK-NEXT: vldrw.u32 q1, [r3], #4
; CHECK-NEXT: vfma.f32 q0, q1, r0
; CHECK-NEXT: le lr, .LBB16_10
; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: add.w r4, r4, r0, lsl #2
; CHECK-NEXT: b .LBB16_4
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: .LBB16_12: @ %if.end
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
2020-03-20 09:25:19 +01:00
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1
%0 = load float*, float** %pState1, align 4
%pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2
%1 = load float*, float** %pCoeffs2, align 4
%numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0
%2 = load i16, i16* %numTaps3, align 4
%conv = zext i16 %2 to i32
%cmp = icmp ugt i32 %blockSize, 7
br i1 %cmp, label %if.then, label %if.end
if.then: ; preds = %entry
%shr = lshr i32 %blockSize, 2
%cmp5217 = icmp eq i32 %shr, 0
br i1 %cmp5217, label %if.end, label %while.body.lr.ph
while.body.lr.ph: ; preds = %if.then
%sub = add nsw i32 %conv, -1
%arrayidx = getelementptr inbounds float, float* %0, i32 %sub
%incdec.ptr = getelementptr inbounds float, float* %1, i32 1
%incdec.ptr7 = getelementptr inbounds float, float* %1, i32 2
%incdec.ptr8 = getelementptr inbounds float, float* %1, i32 3
%incdec.ptr9 = getelementptr inbounds float, float* %1, i32 4
%incdec.ptr10 = getelementptr inbounds float, float* %1, i32 5
%incdec.ptr11 = getelementptr inbounds float, float* %1, i32 6
%incdec.ptr12 = getelementptr inbounds float, float* %1, i32 7
%sub37 = add nsw i32 %conv, -8
%div = sdiv i32 %sub37, 8
%pCoeffsCur.0199 = getelementptr inbounds float, float* %1, i32 8
%cmp38201 = icmp ugt i16 %2, 15
%and = and i32 %sub37, 7
%cmp74210 = icmp eq i32 %and, 0
%idx.neg = sub nsw i32 0, %conv
%3 = icmp sgt i32 %div, 1
%smax = select i1 %3, i32 %div, i32 1
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.end
%blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ]
%pStateCur.0221 = phi float* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ]
%pSamples.0220 = phi float* [ %0, %while.body.lr.ph ], [ %add.ptr83, %while.end ]
%pTempSrc.0219 = phi float* [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ]
%pOutput.0218 = phi float* [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ]
%4 = load float, float* %1, align 4
%5 = load float, float* %incdec.ptr, align 4
%6 = load float, float* %incdec.ptr7, align 4
%7 = load float, float* %incdec.ptr8, align 4
%8 = load float, float* %incdec.ptr9, align 4
%9 = load float, float* %incdec.ptr10, align 4
%10 = load float, float* %incdec.ptr11, align 4
%11 = load float, float* %incdec.ptr12, align 4
%12 = bitcast float* %pTempSrc.0219 to <4 x float>*
%13 = load <4 x float>, <4 x float>* %12, align 4
%14 = bitcast float* %pStateCur.0221 to <4 x float>*
store <4 x float> %13, <4 x float>* %14, align 4
%add.ptr = getelementptr inbounds float, float* %pStateCur.0221, i32 4
%add.ptr14 = getelementptr inbounds float, float* %pTempSrc.0219, i32 4
%15 = bitcast float* %pSamples.0220 to <4 x float>*
%16 = load <4 x float>, <4 x float>* %15, align 4
%.splatinsert = insertelement <4 x float> undef, float %4, i32 0
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
%17 = fmul fast <4 x float> %16, %.splat
%arrayidx15 = getelementptr inbounds float, float* %pSamples.0220, i32 1
%18 = bitcast float* %arrayidx15 to <4 x float>*
%19 = load <4 x float>, <4 x float>* %18, align 4
%.splatinsert16 = insertelement <4 x float> undef, float %5, i32 0
%.splat17 = shufflevector <4 x float> %.splatinsert16, <4 x float> undef, <4 x i32> zeroinitializer
%20 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %19, <4 x float> %.splat17, <4 x float> %17)
%arrayidx18 = getelementptr inbounds float, float* %pSamples.0220, i32 2
%21 = bitcast float* %arrayidx18 to <4 x float>*
%22 = load <4 x float>, <4 x float>* %21, align 4
%.splatinsert19 = insertelement <4 x float> undef, float %6, i32 0
%.splat20 = shufflevector <4 x float> %.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
%23 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %22, <4 x float> %.splat20, <4 x float> %20)
%arrayidx21 = getelementptr inbounds float, float* %pSamples.0220, i32 3
%24 = bitcast float* %arrayidx21 to <4 x float>*
%25 = load <4 x float>, <4 x float>* %24, align 4
%.splatinsert22 = insertelement <4 x float> undef, float %7, i32 0
%.splat23 = shufflevector <4 x float> %.splatinsert22, <4 x float> undef, <4 x i32> zeroinitializer
%26 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %25, <4 x float> %.splat23, <4 x float> %23)
%arrayidx24 = getelementptr inbounds float, float* %pSamples.0220, i32 4
%27 = bitcast float* %arrayidx24 to <4 x float>*
%28 = load <4 x float>, <4 x float>* %27, align 4
%.splatinsert25 = insertelement <4 x float> undef, float %8, i32 0
%.splat26 = shufflevector <4 x float> %.splatinsert25, <4 x float> undef, <4 x i32> zeroinitializer
%29 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %28, <4 x float> %.splat26, <4 x float> %26)
%arrayidx27 = getelementptr inbounds float, float* %pSamples.0220, i32 5
%30 = bitcast float* %arrayidx27 to <4 x float>*
%31 = load <4 x float>, <4 x float>* %30, align 4
%.splatinsert28 = insertelement <4 x float> undef, float %9, i32 0
%.splat29 = shufflevector <4 x float> %.splatinsert28, <4 x float> undef, <4 x i32> zeroinitializer
%32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %31, <4 x float> %.splat29, <4 x float> %29)
%arrayidx30 = getelementptr inbounds float, float* %pSamples.0220, i32 6
%33 = bitcast float* %arrayidx30 to <4 x float>*
%34 = load <4 x float>, <4 x float>* %33, align 4
%.splatinsert31 = insertelement <4 x float> undef, float %10, i32 0
%.splat32 = shufflevector <4 x float> %.splatinsert31, <4 x float> undef, <4 x i32> zeroinitializer
%35 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %34, <4 x float> %.splat32, <4 x float> %32)
%arrayidx33 = getelementptr inbounds float, float* %pSamples.0220, i32 7
%36 = bitcast float* %arrayidx33 to <4 x float>*
%37 = load <4 x float>, <4 x float>* %36, align 4
%.splatinsert34 = insertelement <4 x float> undef, float %11, i32 0
%.splat35 = shufflevector <4 x float> %.splatinsert34, <4 x float> undef, <4 x i32> zeroinitializer
%38 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %37, <4 x float> %.splat35, <4 x float> %35)
%pSamples.1200 = getelementptr inbounds float, float* %pSamples.0220, i32 8
br i1 %cmp38201, label %for.body, label %for.end
for.body: ; preds = %while.body, %for.body
%pSamples.1207 = phi float* [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ]
%pCoeffsCur.0206 = phi float* [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ]
%.pn205 = phi float* [ %pCoeffsCur.0206, %for.body ], [ %1, %while.body ]
%i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ]
%vecAcc0.0203 = phi <4 x float> [ %70, %for.body ], [ %38, %while.body ]
%pSamples.0.pn202 = phi float* [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ]
%incdec.ptr40 = getelementptr inbounds float, float* %.pn205, i32 9
%39 = load float, float* %pCoeffsCur.0206, align 4
%incdec.ptr41 = getelementptr inbounds float, float* %.pn205, i32 10
%40 = load float, float* %incdec.ptr40, align 4
%incdec.ptr42 = getelementptr inbounds float, float* %.pn205, i32 11
%41 = load float, float* %incdec.ptr41, align 4
%incdec.ptr43 = getelementptr inbounds float, float* %.pn205, i32 12
%42 = load float, float* %incdec.ptr42, align 4
%incdec.ptr44 = getelementptr inbounds float, float* %.pn205, i32 13
%43 = load float, float* %incdec.ptr43, align 4
%incdec.ptr45 = getelementptr inbounds float, float* %.pn205, i32 14
%44 = load float, float* %incdec.ptr44, align 4
%incdec.ptr46 = getelementptr inbounds float, float* %.pn205, i32 15
%45 = load float, float* %incdec.ptr45, align 4
%46 = load float, float* %incdec.ptr46, align 4
%47 = bitcast float* %pSamples.1207 to <4 x float>*
%48 = load <4 x float>, <4 x float>* %47, align 4
%.splatinsert48 = insertelement <4 x float> undef, float %39, i32 0
%.splat49 = shufflevector <4 x float> %.splatinsert48, <4 x float> undef, <4 x i32> zeroinitializer
%49 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %48, <4 x float> %.splat49, <4 x float> %vecAcc0.0203)
%arrayidx50 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 9
%50 = bitcast float* %arrayidx50 to <4 x float>*
%51 = load <4 x float>, <4 x float>* %50, align 4
%.splatinsert51 = insertelement <4 x float> undef, float %40, i32 0
%.splat52 = shufflevector <4 x float> %.splatinsert51, <4 x float> undef, <4 x i32> zeroinitializer
%52 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %51, <4 x float> %.splat52, <4 x float> %49)
%arrayidx53 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 10
%53 = bitcast float* %arrayidx53 to <4 x float>*
%54 = load <4 x float>, <4 x float>* %53, align 4
%.splatinsert54 = insertelement <4 x float> undef, float %41, i32 0
%.splat55 = shufflevector <4 x float> %.splatinsert54, <4 x float> undef, <4 x i32> zeroinitializer
%55 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %54, <4 x float> %.splat55, <4 x float> %52)
%arrayidx56 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 11
%56 = bitcast float* %arrayidx56 to <4 x float>*
%57 = load <4 x float>, <4 x float>* %56, align 4
%.splatinsert57 = insertelement <4 x float> undef, float %42, i32 0
%.splat58 = shufflevector <4 x float> %.splatinsert57, <4 x float> undef, <4 x i32> zeroinitializer
%58 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %57, <4 x float> %.splat58, <4 x float> %55)
%arrayidx59 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 12
%59 = bitcast float* %arrayidx59 to <4 x float>*
%60 = load <4 x float>, <4 x float>* %59, align 4
%.splatinsert60 = insertelement <4 x float> undef, float %43, i32 0
%.splat61 = shufflevector <4 x float> %.splatinsert60, <4 x float> undef, <4 x i32> zeroinitializer
%61 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %60, <4 x float> %.splat61, <4 x float> %58)
%arrayidx62 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 13
%62 = bitcast float* %arrayidx62 to <4 x float>*
%63 = load <4 x float>, <4 x float>* %62, align 4
%.splatinsert63 = insertelement <4 x float> undef, float %44, i32 0
%.splat64 = shufflevector <4 x float> %.splatinsert63, <4 x float> undef, <4 x i32> zeroinitializer
%64 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %63, <4 x float> %.splat64, <4 x float> %61)
%arrayidx65 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 14
%65 = bitcast float* %arrayidx65 to <4 x float>*
%66 = load <4 x float>, <4 x float>* %65, align 4
%.splatinsert66 = insertelement <4 x float> undef, float %45, i32 0
%.splat67 = shufflevector <4 x float> %.splatinsert66, <4 x float> undef, <4 x i32> zeroinitializer
%67 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %66, <4 x float> %.splat67, <4 x float> %64)
%arrayidx68 = getelementptr inbounds float, float* %pSamples.0.pn202, i32 15
%68 = bitcast float* %arrayidx68 to <4 x float>*
%69 = load <4 x float>, <4 x float>* %68, align 4
%.splatinsert69 = insertelement <4 x float> undef, float %46, i32 0
%.splat70 = shufflevector <4 x float> %.splatinsert69, <4 x float> undef, <4 x i32> zeroinitializer
%70 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %69, <4 x float> %.splat70, <4 x float> %67)
%inc = add nuw nsw i32 %i.0204, 1
%pCoeffsCur.0 = getelementptr inbounds float, float* %pCoeffsCur.0206, i32 8
%pSamples.1 = getelementptr inbounds float, float* %pSamples.1207, i32 8
%exitcond = icmp eq i32 %inc, %smax
br i1 %exitcond, label %for.end, label %for.body
for.end: ; preds = %for.body, %while.body
%vecAcc0.0.lcssa = phi <4 x float> [ %38, %while.body ], [ %70, %for.body ]
%pCoeffsCur.0.lcssa = phi float* [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ]
%pSamples.1.lcssa = phi float* [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ]
br i1 %cmp74210, label %while.end, label %while.body76
while.body76: ; preds = %for.end, %while.body76
%pCoeffsCur.1214 = phi float* [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ]
%vecAcc0.1213 = phi <4 x float> [ %74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ]
%numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ]
%pSamples.2211 = phi float* [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ]
%incdec.ptr77 = getelementptr inbounds float, float* %pCoeffsCur.1214, i32 1
%71 = load float, float* %pCoeffsCur.1214, align 4
%72 = bitcast float* %pSamples.2211 to <4 x float>*
%73 = load <4 x float>, <4 x float>* %72, align 4
%.splatinsert78 = insertelement <4 x float> undef, float %71, i32 0
%.splat79 = shufflevector <4 x float> %.splatinsert78, <4 x float> undef, <4 x i32> zeroinitializer
%74 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %73, <4 x float> %.splat79, <4 x float> %vecAcc0.1213)
%incdec.ptr80 = getelementptr inbounds float, float* %pSamples.2211, i32 1
%dec = add nsw i32 %numCnt.0212, -1
%cmp74 = icmp sgt i32 %numCnt.0212, 1
br i1 %cmp74, label %while.body76, label %while.end.loopexit
while.end.loopexit: ; preds = %while.body76
%scevgep = getelementptr float, float* %pSamples.1.lcssa, i32 %and
br label %while.end
while.end: ; preds = %while.end.loopexit, %for.end
%pSamples.2.lcssa = phi float* [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ]
%vecAcc0.1.lcssa = phi <4 x float> [ %vecAcc0.0.lcssa, %for.end ], [ %74, %while.end.loopexit ]
%75 = bitcast float* %pOutput.0218 to <4 x float>*
store <4 x float> %vecAcc0.1.lcssa, <4 x float>* %75, align 4
%add.ptr81 = getelementptr inbounds float, float* %pOutput.0218, i32 4
%add.ptr82 = getelementptr inbounds float, float* %pSamples.2.lcssa, i32 4
%add.ptr83 = getelementptr inbounds float, float* %add.ptr82, i32 %idx.neg
%dec84 = add nsw i32 %blkCnt.0222, -1
%cmp5 = icmp eq i32 %dec84, 0
br i1 %cmp5, label %if.end, label %while.body
if.end: ; preds = %while.end, %if.then, %entry
ret void
}
%struct.arm_biquad_cascade_stereo_df2T_instance_f32 = type { i8, float*, float* }
define arm_aapcs_vfpcc void @arm_biquad_cascade_stereo_df2T_f32(%struct.arm_biquad_cascade_stereo_df2T_instance_f32* nocapture readonly %0, float* %1, float* %2, i32 %3) {
; CHECK-LABEL: arm_biquad_cascade_stereo_df2T_f32:
; CHECK: @ %bb.0:
[ARM] Alter t2DoLoopStart to define lr This changes the definition of t2DoLoopStart from t2DoLoopStart rGPR to GPRlr = t2DoLoopStart rGPR This will hopefully mean that low overhead loops are more tied together, and we can more reliably generate loops without reverting or being at the whims of the register allocator. This is a fairly simple change in itself, but leads to a number of other required alterations. - The hardware loop pass, if UsePhi is set, now generates loops of the form: %start = llvm.start.loop.iterations(%N) loop: %p = phi [%start], [%dec] %dec = llvm.loop.decrement.reg(%p, 1) %c = icmp ne %dec, 0 br %c, loop, exit - For this a new llvm.start.loop.iterations intrinsic was added, identical to llvm.set.loop.iterations but produces a value as seen above, gluing the loop together more through def-use chains. - This new instrinsic conceptually produces the same output as input, which is taught to SCEV so that the checks in MVETailPredication are not affected. - Some minor changes are needed to the ARMLowOverheadLoop pass, but it has been left mostly as before. We should now more reliably be able to tell that the t2DoLoopStart is correct without having to prove it, but t2WhileLoopStart and tail-predicated loops will remain the same. - And all the tests have been updated. There are a lot of them! This patch on it's own might cause more trouble that it helps, with more tail-predicated loops being reverted, but some additional patches can hopefully improve upon that to get to something that is better overall. Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 16:57:58 +01:00
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: .pad #24
; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: mov r8, r3
; CHECK-NEXT: ldrb.w r12, [r0]
; CHECK-NEXT: ldrd r3, r0, [r0, #4]
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: cmp.w r8, #0
; CHECK-NEXT: strd r4, r4, [sp, #16]
; CHECK-NEXT: beq .LBB17_5
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: movs r5, #2
; CHECK-NEXT: viwdup.u32 q0, r4, r5, #1
; CHECK-NEXT: mov r4, sp
; CHECK-NEXT: .LBB17_2: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB17_3 Depth 2
[ARM] Alter t2DoLoopStart to define lr This changes the definition of t2DoLoopStart from t2DoLoopStart rGPR to GPRlr = t2DoLoopStart rGPR This will hopefully mean that low overhead loops are more tied together, and we can more reliably generate loops without reverting or being at the whims of the register allocator. This is a fairly simple change in itself, but leads to a number of other required alterations. - The hardware loop pass, if UsePhi is set, now generates loops of the form: %start = llvm.start.loop.iterations(%N) loop: %p = phi [%start], [%dec] %dec = llvm.loop.decrement.reg(%p, 1) %c = icmp ne %dec, 0 br %c, loop, exit - For this a new llvm.start.loop.iterations intrinsic was added, identical to llvm.set.loop.iterations but produces a value as seen above, gluing the loop together more through def-use chains. - This new instrinsic conceptually produces the same output as input, which is taught to SCEV so that the checks in MVETailPredication are not affected. - Some minor changes are needed to the ARMLowOverheadLoop pass, but it has been left mostly as before. We should now more reliably be able to tell that the t2DoLoopStart is correct without having to prove it, but t2WhileLoopStart and tail-predicated loops will remain the same. - And all the tests have been updated. There are a lot of them! This patch on it's own might cause more trouble that it helps, with more tail-predicated loops being reverted, but some additional patches can hopefully improve upon that to get to something that is better overall. Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 16:57:58 +01:00
; CHECK-NEXT: ldrd r5, r7, [r0]
; CHECK-NEXT: vldrw.u32 q1, [r3]
[ARM] Alter t2DoLoopStart to define lr This changes the definition of t2DoLoopStart from t2DoLoopStart rGPR to GPRlr = t2DoLoopStart rGPR This will hopefully mean that low overhead loops are more tied together, and we can more reliably generate loops without reverting or being at the whims of the register allocator. This is a fairly simple change in itself, but leads to a number of other required alterations. - The hardware loop pass, if UsePhi is set, now generates loops of the form: %start = llvm.start.loop.iterations(%N) loop: %p = phi [%start], [%dec] %dec = llvm.loop.decrement.reg(%p, 1) %c = icmp ne %dec, 0 br %c, loop, exit - For this a new llvm.start.loop.iterations intrinsic was added, identical to llvm.set.loop.iterations but produces a value as seen above, gluing the loop together more through def-use chains. - This new instrinsic conceptually produces the same output as input, which is taught to SCEV so that the checks in MVETailPredication are not affected. - Some minor changes are needed to the ARMLowOverheadLoop pass, but it has been left mostly as before. We should now more reliably be able to tell that the t2DoLoopStart is correct without having to prove it, but t2WhileLoopStart and tail-predicated loops will remain the same. - And all the tests have been updated. There are a lot of them! This patch on it's own might cause more trouble that it helps, with more tail-predicated loops being reverted, but some additional patches can hopefully improve upon that to get to something that is better overall. Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 16:57:58 +01:00
; CHECK-NEXT: vldr s8, [r0, #8]
; CHECK-NEXT: ldr r6, [r0, #12]
; CHECK-NEXT: vstrw.32 q1, [r4]
[ARM] Alter t2DoLoopStart to define lr This changes the definition of t2DoLoopStart from t2DoLoopStart rGPR to GPRlr = t2DoLoopStart rGPR This will hopefully mean that low overhead loops are more tied together, and we can more reliably generate loops without reverting or being at the whims of the register allocator. This is a fairly simple change in itself, but leads to a number of other required alterations. - The hardware loop pass, if UsePhi is set, now generates loops of the form: %start = llvm.start.loop.iterations(%N) loop: %p = phi [%start], [%dec] %dec = llvm.loop.decrement.reg(%p, 1) %c = icmp ne %dec, 0 br %c, loop, exit - For this a new llvm.start.loop.iterations intrinsic was added, identical to llvm.set.loop.iterations but produces a value as seen above, gluing the loop together more through def-use chains. - This new instrinsic conceptually produces the same output as input, which is taught to SCEV so that the checks in MVETailPredication are not affected. - Some minor changes are needed to the ARMLowOverheadLoop pass, but it has been left mostly as before. We should now more reliably be able to tell that the t2DoLoopStart is correct without having to prove it, but t2WhileLoopStart and tail-predicated loops will remain the same. - And all the tests have been updated. There are a lot of them! This patch on it's own might cause more trouble that it helps, with more tail-predicated loops being reverted, but some additional patches can hopefully improve upon that to get to something that is better overall. Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 16:57:58 +01:00
; CHECK-NEXT: vdup.32 q1, r7
; CHECK-NEXT: vldr s12, [r0, #16]
; CHECK-NEXT: vmov.f32 s6, s8
; CHECK-NEXT: dls lr, r8
[ARM] Alter t2DoLoopStart to define lr This changes the definition of t2DoLoopStart from t2DoLoopStart rGPR to GPRlr = t2DoLoopStart rGPR This will hopefully mean that low overhead loops are more tied together, and we can more reliably generate loops without reverting or being at the whims of the register allocator. This is a fairly simple change in itself, but leads to a number of other required alterations. - The hardware loop pass, if UsePhi is set, now generates loops of the form: %start = llvm.start.loop.iterations(%N) loop: %p = phi [%start], [%dec] %dec = llvm.loop.decrement.reg(%p, 1) %c = icmp ne %dec, 0 br %c, loop, exit - For this a new llvm.start.loop.iterations intrinsic was added, identical to llvm.set.loop.iterations but produces a value as seen above, gluing the loop together more through def-use chains. - This new instrinsic conceptually produces the same output as input, which is taught to SCEV so that the checks in MVETailPredication are not affected. - Some minor changes are needed to the ARMLowOverheadLoop pass, but it has been left mostly as before. We should now more reliably be able to tell that the t2DoLoopStart is correct without having to prove it, but t2WhileLoopStart and tail-predicated loops will remain the same. - And all the tests have been updated. There are a lot of them! This patch on it's own might cause more trouble that it helps, with more tail-predicated loops being reverted, but some additional patches can hopefully improve upon that to get to something that is better overall. Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 16:57:58 +01:00
; CHECK-NEXT: vmov.f32 s7, s8
; CHECK-NEXT: vdup.32 q2, r6
; CHECK-NEXT: vmov.f32 s10, s12
; CHECK-NEXT: mov r7, r2
; CHECK-NEXT: vmov.f32 s11, s12
; CHECK-NEXT: .LBB17_3: @ Parent Loop BB17_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q4, [r1, q0, uxtw #2]
; CHECK-NEXT: vldrw.u32 q5, [r4, q0, uxtw #2]
; CHECK-NEXT: vldrw.u32 q3, [sp, #8]
; CHECK-NEXT: adds r1, #8
; CHECK-NEXT: vfma.f32 q5, q4, r5
; CHECK-NEXT: vfma.f32 q3, q5, q2
; CHECK-NEXT: vstmia r7!, {s20, s21}
; CHECK-NEXT: vfma.f32 q3, q4, q1
; CHECK-NEXT: vstrw.32 q3, [r4]
; CHECK-NEXT: le lr, .LBB17_3
; CHECK-NEXT: @ %bb.4: @ in Loop: Header=BB17_2 Depth=1
; CHECK-NEXT: subs.w r12, r12, #1
[ARM] Alter t2DoLoopStart to define lr This changes the definition of t2DoLoopStart from t2DoLoopStart rGPR to GPRlr = t2DoLoopStart rGPR This will hopefully mean that low overhead loops are more tied together, and we can more reliably generate loops without reverting or being at the whims of the register allocator. This is a fairly simple change in itself, but leads to a number of other required alterations. - The hardware loop pass, if UsePhi is set, now generates loops of the form: %start = llvm.start.loop.iterations(%N) loop: %p = phi [%start], [%dec] %dec = llvm.loop.decrement.reg(%p, 1) %c = icmp ne %dec, 0 br %c, loop, exit - For this a new llvm.start.loop.iterations intrinsic was added, identical to llvm.set.loop.iterations but produces a value as seen above, gluing the loop together more through def-use chains. - This new instrinsic conceptually produces the same output as input, which is taught to SCEV so that the checks in MVETailPredication are not affected. - Some minor changes are needed to the ARMLowOverheadLoop pass, but it has been left mostly as before. We should now more reliably be able to tell that the t2DoLoopStart is correct without having to prove it, but t2WhileLoopStart and tail-predicated loops will remain the same. - And all the tests have been updated. There are a lot of them! This patch on it's own might cause more trouble that it helps, with more tail-predicated loops being reverted, but some additional patches can hopefully improve upon that to get to something that is better overall. Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 16:57:58 +01:00
; CHECK-NEXT: add.w r0, r0, #20
; CHECK-NEXT: vstrb.8 q3, [r3], #16
; CHECK-NEXT: mov r1, r2
; CHECK-NEXT: bne .LBB17_2
; CHECK-NEXT: b .LBB17_7
; CHECK-NEXT: .LBB17_5: @ %.preheader
; CHECK-NEXT: dls lr, r12
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: .LBB17_6: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r3], #16
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: le lr, .LBB17_6
; CHECK-NEXT: .LBB17_7:
; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9, d10, d11}
[ARM] Alter t2DoLoopStart to define lr This changes the definition of t2DoLoopStart from t2DoLoopStart rGPR to GPRlr = t2DoLoopStart rGPR This will hopefully mean that low overhead loops are more tied together, and we can more reliably generate loops without reverting or being at the whims of the register allocator. This is a fairly simple change in itself, but leads to a number of other required alterations. - The hardware loop pass, if UsePhi is set, now generates loops of the form: %start = llvm.start.loop.iterations(%N) loop: %p = phi [%start], [%dec] %dec = llvm.loop.decrement.reg(%p, 1) %c = icmp ne %dec, 0 br %c, loop, exit - For this a new llvm.start.loop.iterations intrinsic was added, identical to llvm.set.loop.iterations but produces a value as seen above, gluing the loop together more through def-use chains. - This new instrinsic conceptually produces the same output as input, which is taught to SCEV so that the checks in MVETailPredication are not affected. - Some minor changes are needed to the ARMLowOverheadLoop pass, but it has been left mostly as before. We should now more reliably be able to tell that the t2DoLoopStart is correct without having to prove it, but t2WhileLoopStart and tail-predicated loops will remain the same. - And all the tests have been updated. There are a lot of them! This patch on it's own might cause more trouble that it helps, with more tail-predicated loops being reverted, but some additional patches can hopefully improve upon that to get to something that is better overall. Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 16:57:58 +01:00
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
%5 = alloca [6 x float], align 4
%6 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, %struct.arm_biquad_cascade_stereo_df2T_instance_f32* %0, i32 0, i32 1
%7 = load float*, float** %6, align 4
%8 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, %struct.arm_biquad_cascade_stereo_df2T_instance_f32* %0, i32 0, i32 2
%9 = load float*, float** %8, align 4
%10 = getelementptr inbounds %struct.arm_biquad_cascade_stereo_df2T_instance_f32, %struct.arm_biquad_cascade_stereo_df2T_instance_f32* %0, i32 0, i32 0
%11 = load i8, i8* %10, align 4
%12 = zext i8 %11 to i32
%13 = bitcast [6 x float]* %5 to i8*
call void @llvm.lifetime.start.p0i8(i64 24, i8* nonnull %13) #5
%14 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 0, i32 2, i32 1)
%15 = extractvalue { <4 x i32>, i32 } %14, 0
%16 = getelementptr inbounds [6 x float], [6 x float]* %5, i32 0, i32 4
store float 0.000000e+00, float* %16, align 4
%17 = getelementptr inbounds [6 x float], [6 x float]* %5, i32 0, i32 5
store float 0.000000e+00, float* %17, align 4
%18 = bitcast [6 x float]* %5 to <4 x float>*
%19 = icmp eq i32 %3, 0
%20 = bitcast [6 x float]* %5 to i32*
%21 = getelementptr inbounds [6 x float], [6 x float]* %5, i32 0, i32 2
%22 = bitcast float* %21 to <4 x float>*
br i1 %19, label %23, label %31
23: ; preds = %4, %23
%24 = phi i32 [ %29, %23 ], [ %12, %4 ]
%25 = phi float* [ %28, %23 ], [ %7, %4 ]
%26 = bitcast float* %25 to <4 x float>*
%27 = load <4 x float>, <4 x float>* %26, align 8
store <4 x float> %27, <4 x float>* %18, align 4
%28 = getelementptr inbounds float, float* %25, i32 4
%29 = add i32 %24, -1
%30 = icmp eq i32 %29, 0
br i1 %30, label %82, label %23
31: ; preds = %4, %77
%32 = phi i32 [ %80, %77 ], [ %12, %4 ]
%33 = phi float* [ %78, %77 ], [ %9, %4 ]
%34 = phi float* [ %79, %77 ], [ %7, %4 ]
%35 = phi float* [ %2, %77 ], [ %1, %4 ]
%36 = getelementptr inbounds float, float* %33, i32 1
%37 = load float, float* %33, align 4
%38 = getelementptr inbounds float, float* %33, i32 2
%39 = load float, float* %36, align 4
%40 = getelementptr inbounds float, float* %33, i32 3
%41 = load float, float* %38, align 4
%42 = getelementptr inbounds float, float* %33, i32 4
%43 = load float, float* %40, align 4
%44 = load float, float* %42, align 4
%45 = insertelement <4 x float> undef, float %43, i32 0
%46 = shufflevector <4 x float> %45, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
%47 = insertelement <4 x float> %46, float %44, i32 2
%48 = insertelement <4 x float> %47, float %44, i32 3
%49 = insertelement <4 x float> undef, float %39, i32 0
%50 = shufflevector <4 x float> %49, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 undef, i32 undef>
%51 = insertelement <4 x float> %50, float %41, i32 2
%52 = insertelement <4 x float> %51, float %41, i32 3
%53 = bitcast float* %34 to <4 x float>*
%54 = load <4 x float>, <4 x float>* %53, align 8
store <4 x float> %54, <4 x float>* %18, align 4
%55 = insertelement <4 x float> undef, float %37, i32 0
%56 = shufflevector <4 x float> %55, <4 x float> undef, <4 x i32> zeroinitializer
br label %57
57: ; preds = %31, %57
%58 = phi float* [ %35, %31 ], [ %74, %57 ]
%59 = phi float* [ %2, %31 ], [ %70, %57 ]
%60 = phi i32 [ %3, %31 ], [ %75, %57 ]
%61 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* nonnull %20, <4 x i32> %15, i32 32, i32 2, i32 1)
%62 = bitcast <4 x i32> %61 to <4 x float>
%63 = bitcast float* %58 to i32*
%64 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %63, <4 x i32> %15, i32 32, i32 2, i32 1)
%65 = bitcast <4 x i32> %64 to <4 x float>
%66 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %65, <4 x float> %56, <4 x float> %62)
%67 = extractelement <4 x float> %66, i32 0
%68 = getelementptr inbounds float, float* %59, i32 1
store float %67, float* %59, align 4
%69 = extractelement <4 x float> %66, i32 1
%70 = getelementptr inbounds float, float* %59, i32 2
store float %69, float* %68, align 4
%71 = load <4 x float>, <4 x float>* %22, align 4
%72 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %66, <4 x float> %48, <4 x float> %71)
%73 = call fast <4 x float> @llvm.fma.v4f32(<4 x float> %65, <4 x float> %52, <4 x float> %72)
store <4 x float> %73, <4 x float>* %18, align 4
%74 = getelementptr inbounds float, float* %58, i32 2
%75 = add i32 %60, -1
%76 = icmp eq i32 %75, 0
br i1 %76, label %77, label %57
77: ; preds = %57
%78 = getelementptr inbounds float, float* %33, i32 5
store <4 x float> %73, <4 x float>* %53, align 4
%79 = getelementptr inbounds float, float* %34, i32 4
%80 = add i32 %32, -1
%81 = icmp eq i32 %80, 0
br i1 %81, label %82, label %31
82: ; preds = %77, %23
call void @llvm.lifetime.end.p0i8(i64 24, i8* nonnull %13) #5
ret void
}
define arm_aapcs_vfpcc void @fms(float* nocapture readonly %pSrc1, float* nocapture readonly %pSrc2, float* nocapture readonly %pSrc3, float* nocapture %pDst, i32 %N, i32 %M) {
; CHECK-LABEL: fms:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: ldr r4, [sp, #16]
; CHECK-NEXT: lsrs r5, r4, #2
; CHECK-NEXT: beq .LBB18_5
; CHECK-NEXT: @ %bb.1: @ %do.body.preheader
; CHECK-NEXT: ldr.w r12, [sp, #20]
; CHECK-NEXT: .LBB18_2: @ %do.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB18_3 Depth 2
; CHECK-NEXT: ldr r4, [r2]
; CHECK-NEXT: dls lr, r5
; CHECK-NEXT: vdup.32 q0, r4
; CHECK-NEXT: .LBB18_3: @ %while.body
; CHECK-NEXT: @ Parent Loop BB18_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vldrw.u32 q1, [r1], #16
; CHECK-NEXT: vldrw.u32 q2, [r0], #16
; CHECK-NEXT: vfms.f32 q2, q1, q0
; CHECK-NEXT: vstrb.8 q2, [r3], #16
; CHECK-NEXT: le lr, .LBB18_3
; CHECK-NEXT: @ %bb.4: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB18_2 Depth=1
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: add.w r2, r2, #4
; CHECK-NEXT: bne .LBB18_2
; CHECK-NEXT: .LBB18_5: @ %do.end
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%shr = lshr i32 %N, 2
%cmp15 = icmp eq i32 %shr, 0
br i1 %cmp15, label %do.end, label %do.body
do.body: ; preds = %entry, %while.end
%pDst.addr.0 = phi float* [ %add.ptr2, %while.end ], [ %pDst, %entry ]
%M.addr.0 = phi i32 [ %dec3, %while.end ], [ %M, %entry ]
%pSrc3.addr.0 = phi float* [ %incdec.ptr, %while.end ], [ %pSrc3, %entry ]
%pSrc2.addr.0 = phi float* [ %add.ptr1, %while.end ], [ %pSrc2, %entry ]
%pSrc1.addr.0 = phi float* [ %add.ptr, %while.end ], [ %pSrc1, %entry ]
%0 = load float, float* %pSrc3.addr.0, align 4
%.splatinsert = insertelement <4 x float> undef, float %0, i32 0
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
br label %while.body
while.body: ; preds = %do.body, %while.body
%pSrc1.addr.119 = phi float* [ %pSrc1.addr.0, %do.body ], [ %add.ptr, %while.body ]
%pSrc2.addr.118 = phi float* [ %pSrc2.addr.0, %do.body ], [ %add.ptr1, %while.body ]
%blkCnt.017 = phi i32 [ %shr, %do.body ], [ %dec, %while.body ]
%pDst.addr.116 = phi float* [ %pDst.addr.0, %do.body ], [ %add.ptr2, %while.body ]
%1 = bitcast float* %pSrc1.addr.119 to <4 x float>*
%2 = load <4 x float>, <4 x float>* %1, align 4
%3 = bitcast float* %pSrc2.addr.118 to <4 x float>*
%4 = load <4 x float>, <4 x float>* %3, align 4
%5 = fneg fast <4 x float> %4
%6 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %.splat, <4 x float> %5, <4 x float> %2)
%7 = bitcast float* %pDst.addr.116 to <4 x float>*
store <4 x float> %6, <4 x float>* %7, align 4
%add.ptr = getelementptr inbounds float, float* %pSrc1.addr.119, i32 4
%add.ptr1 = getelementptr inbounds float, float* %pSrc2.addr.118, i32 4
%add.ptr2 = getelementptr inbounds float, float* %pDst.addr.116, i32 4
%dec = add nsw i32 %blkCnt.017, -1
%cmp = icmp eq i32 %dec, 0
br i1 %cmp, label %while.end, label %while.body
while.end: ; preds = %while.body
%incdec.ptr = getelementptr inbounds float, float* %pSrc3.addr.0, i32 1
%dec3 = add i32 %M.addr.0, -1
%cmp4 = icmp eq i32 %dec3, 0
br i1 %cmp4, label %do.end, label %do.body
do.end: ; preds = %while.end, %entry
ret void
}
%struct.arm_biquad_casd_df1_inst_f32 = type { i32, float*, float* }
define arm_aapcs_vfpcc void @arm_biquad_cascade_df1_f32(%struct.arm_biquad_casd_df1_inst_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %blockSize) {
; CHECK-LABEL: arm_biquad_cascade_df1_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #64
; CHECK-NEXT: sub sp, #64
; CHECK-NEXT: ldrd r12, r10, [r0]
; CHECK-NEXT: @ implicit-def: $s2
; CHECK-NEXT: and r7, r3, #3
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: ldr.w r9, [r0, #8]
; CHECK-NEXT: lsrs r0, r3, #2
; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: str r2, [sp, #44] @ 4-byte Spill
; CHECK-NEXT: b .LBB19_3
; CHECK-NEXT: .LBB19_1: @ in Loop: Header=BB19_3 Depth=1
; CHECK-NEXT: vmov.f32 s14, s7
; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload
; CHECK-NEXT: vmov.f32 s0, s10
; CHECK-NEXT: vmov.f32 s7, s6
; CHECK-NEXT: .LBB19_2: @ %if.end69
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
; CHECK-NEXT: vstr s8, [r10]
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: vstr s0, [r10, #4]
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: add.w r9, r9, #128
; CHECK-NEXT: vstr s14, [r10, #8]
; CHECK-NEXT: mov r1, r2
; CHECK-NEXT: vstr s7, [r10, #12]
; CHECK-NEXT: add.w r10, r10, #16
; CHECK-NEXT: beq.w .LBB19_13
; CHECK-NEXT: .LBB19_3: @ %do.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB19_5 Depth 2
; CHECK-NEXT: vldr s7, [r10, #8]
; CHECK-NEXT: mov r5, r2
; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
; CHECK-NEXT: vldr s8, [r10]
; CHECK-NEXT: vldr s10, [r10, #4]
; CHECK-NEXT: vldr s6, [r10, #12]
; CHECK-NEXT: wls lr, r0, .LBB19_6
; CHECK-NEXT: @ %bb.4: @ %while.body.lr.ph
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
; CHECK-NEXT: ldr r5, [sp, #44] @ 4-byte Reload
; CHECK-NEXT: .LBB19_5: @ %while.body
; CHECK-NEXT: @ Parent Loop BB19_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vmov r4, s8
; CHECK-NEXT: vldr s8, [r1, #12]
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q0, [r9, #112]
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: vldr s10, [r1, #8]
; CHECK-NEXT: vmov r7, s7
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vmov r11, s6
; CHECK-NEXT: vldrw.u32 q1, [r9]
; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: vmov r8, s8
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q0, [r9, #16]
; CHECK-NEXT: ldr r6, [r1, #4]
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q7, [r9, #32]
; CHECK-NEXT: vmul.f32 q1, q1, r8
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vldrw.u32 q3, [r9, #48]
; CHECK-NEXT: vfma.f32 q1, q0, r0
; CHECK-NEXT: ldr r0, [r1], #16
; CHECK-NEXT: vfma.f32 q1, q7, r6
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q6, [r9, #64]
; CHECK-NEXT: vfma.f32 q1, q3, r0
; CHECK-NEXT: vldrw.u32 q5, [r9, #80]
; CHECK-NEXT: vfma.f32 q1, q6, r4
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q4, [r9, #96]
; CHECK-NEXT: vfma.f32 q1, q5, r3
; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vfma.f32 q1, q4, r7
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vfma.f32 q1, q0, r11
; CHECK-NEXT: vmov.f32 s2, s8
; CHECK-NEXT: vstrb.8 q1, [r5], #16
; CHECK-NEXT: le lr, .LBB19_5
; CHECK-NEXT: .LBB19_6: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: cmp r7, #0
; CHECK-NEXT: beq .LBB19_1
; CHECK-NEXT: @ %bb.7: @ %if.then
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
; CHECK-NEXT: vldr s24, [r1]
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vldr s0, [r1, #4]
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q3, [r9]
; CHECK-NEXT: vldr s3, [r1, #12]
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q4, [r9, #32]
; CHECK-NEXT: vldr s1, [r1, #8]
; CHECK-NEXT: vmov r1, s10
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q2, [r9, #96]
; CHECK-NEXT: vmov r6, s3
; CHECK-NEXT: vmul.f32 q3, q3, r6
; CHECK-NEXT: vmov r6, s1
; CHECK-NEXT: vstrw.32 q2, [sp, #24] @ 16-byte Spill
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q2, [r9, #112]
; CHECK-NEXT: vldrw.u32 q5, [r9, #48]
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q2, [r9, #80]
; CHECK-NEXT: vldrw.u32 q7, [r9, #64]
; CHECK-NEXT: vmov r3, s24
; CHECK-NEXT: vstrw.32 q2, [sp, #8] @ 16-byte Spill
[ARM] Improve WLS lowering Recently we improved the lowering of low overhead loops and tail predicated loops, but concentrated first on the DLS do style loops. This extends those improvements over to the WLS while loops, improving the chance of lowering them successfully. To do this the lowering has to change a little as the instructions are terminators that produce a value - something that needs to be treated carefully. Lowering starts at the Hardware Loop pass, inserting a new llvm.test.start.loop.iterations that produces both an i1 to control the loop entry and an i32 similar to the llvm.start.loop.iterations intrinsic added for do loops. This feeds into the loop phi, properly gluing the values together: %wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div) %wls0 = extractvalue { i32, i1 } %wls, 0 %wls1 = extractvalue { i32, i1 } %wls, 1 br i1 %wls1, label %loop.ph, label %loop.exit ... loop: %lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ] .. %iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1) %cmp = icmp ne i32 %iv.next, 0 br i1 %cmp, label %loop, label %loop.exit The llvm.test.start.loop.iterations need to be lowered through ISel lowering as a pair of WLS and WLSSETUP nodes, which each get converted to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent t2WhileLoopStart from being a terminator that produces a value, something difficult to control at that stage in the pipeline. Instead the t2WhileLoopSetup produces the value of LR (essentially acting as a lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc). These are then converted into a single t2WhileLoopStartLR at the same point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop to prevent them from progressing further in the pipeline. The t2WhileLoopStartLR is a single instruction that takes a GPR and produces LR, similar to the WLS instruction. %1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3 t2B %bb.1 ... bb.2.loop: %2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2 ... %3:gprlr = t2LoopEndDec %2:gprlr, %bb.2 t2B %bb.3 The t2WhileLoopStartLR can then be treated similar to the other low overhead loop pseudos, eventually being lowered to a WLS providing the branches are within range. Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
; CHECK-NEXT: vldrw.u32 q2, [r9, #16]
; CHECK-NEXT: vmov r2, s7
; CHECK-NEXT: cmp r7, #1
; CHECK-NEXT: vfma.f32 q3, q2, r6
; CHECK-NEXT: vldrw.u32 q2, [sp, #8] @ 16-byte Reload
; CHECK-NEXT: vfma.f32 q3, q4, r4
; CHECK-NEXT: vmov lr, s6
; CHECK-NEXT: vfma.f32 q3, q5, r3
; CHECK-NEXT: vfma.f32 q3, q7, r0
; CHECK-NEXT: vfma.f32 q3, q2, r1
; CHECK-NEXT: vldrw.u32 q2, [sp, #24] @ 16-byte Reload
; CHECK-NEXT: vfma.f32 q3, q2, r2
; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vfma.f32 q3, q2, lr
; CHECK-NEXT: bne .LBB19_9
; CHECK-NEXT: @ %bb.8: @ %if.then58
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
; CHECK-NEXT: vstr s12, [r5]
; CHECK-NEXT: vmov.f32 s8, s24
; CHECK-NEXT: vmov.f32 s0, s2
; CHECK-NEXT: vmov.f32 s14, s12
; CHECK-NEXT: b .LBB19_11
; CHECK-NEXT: .LBB19_9: @ %if.else
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
; CHECK-NEXT: cmp r7, #2
; CHECK-NEXT: vstmia r5, {s12, s13}
; CHECK-NEXT: bne .LBB19_12
; CHECK-NEXT: @ %bb.10: @ in Loop: Header=BB19_3 Depth=1
; CHECK-NEXT: vmov.f32 s8, s0
; CHECK-NEXT: vmov.f32 s14, s13
; CHECK-NEXT: vmov.f32 s0, s24
; CHECK-NEXT: vmov.f32 s7, s12
; CHECK-NEXT: .LBB19_11: @ %if.end69
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
; CHECK-NEXT: vmov.f32 s2, s3
; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload
; CHECK-NEXT: b .LBB19_2
; CHECK-NEXT: .LBB19_12: @ %if.else64
; CHECK-NEXT: @ in Loop: Header=BB19_3 Depth=1
; CHECK-NEXT: vmov.f32 s7, s13
; CHECK-NEXT: ldr r2, [sp, #44] @ 4-byte Reload
; CHECK-NEXT: vmov.f32 s2, s3
; CHECK-NEXT: vstr s14, [r5, #8]
; CHECK-NEXT: vmov.f32 s8, s1
; CHECK-NEXT: b .LBB19_2
; CHECK-NEXT: .LBB19_13: @ %do.end
; CHECK-NEXT: add sp, #64
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%pState1 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_f32, %struct.arm_biquad_casd_df1_inst_f32* %S, i32 0, i32 1
%0 = load float*, float** %pState1, align 4
%pCoeffs2 = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_f32, %struct.arm_biquad_casd_df1_inst_f32* %S, i32 0, i32 2
%1 = load float*, float** %pCoeffs2, align 4
%numStages = getelementptr inbounds %struct.arm_biquad_casd_df1_inst_f32, %struct.arm_biquad_casd_df1_inst_f32* %S, i32 0, i32 0
%2 = load i32, i32* %numStages, align 4
%shr = lshr i32 %blockSize, 2
%cmp201 = icmp eq i32 %shr, 0
%and = and i32 %blockSize, 3
%tobool = icmp eq i32 %and, 0
%cmp57 = icmp eq i32 %and, 1
%cmp60 = icmp eq i32 %and, 2
br label %do.body
do.body: ; preds = %if.end69, %entry
%pState.0 = phi float* [ %0, %entry ], [ %incdec.ptr73, %if.end69 ]
%pCoeffs.0 = phi float* [ %1, %entry ], [ %add.ptr74, %if.end69 ]
%pIn.0 = phi float* [ %pSrc, %entry ], [ %pDst, %if.end69 ]
%X3.0 = phi float [ undef, %entry ], [ %X3.2, %if.end69 ]
%stage.0 = phi i32 [ %2, %entry ], [ %dec75, %if.end69 ]
%3 = load float, float* %pState.0, align 4
%arrayidx3 = getelementptr inbounds float, float* %pState.0, i32 1
%4 = load float, float* %arrayidx3, align 4
%arrayidx4 = getelementptr inbounds float, float* %pState.0, i32 2
%5 = load float, float* %arrayidx4, align 4
%arrayidx5 = getelementptr inbounds float, float* %pState.0, i32 3
%6 = load float, float* %arrayidx5, align 4
br i1 %cmp201, label %while.end, label %while.body.lr.ph
while.body.lr.ph: ; preds = %do.body
%7 = bitcast float* %pCoeffs.0 to <4 x float>*
%arrayidx9 = getelementptr inbounds float, float* %pCoeffs.0, i32 4
%8 = bitcast float* %arrayidx9 to <4 x float>*
%arrayidx12 = getelementptr inbounds float, float* %pCoeffs.0, i32 8
%9 = bitcast float* %arrayidx12 to <4 x float>*
%arrayidx15 = getelementptr inbounds float, float* %pCoeffs.0, i32 12
%10 = bitcast float* %arrayidx15 to <4 x float>*
%arrayidx18 = getelementptr inbounds float, float* %pCoeffs.0, i32 16
%11 = bitcast float* %arrayidx18 to <4 x float>*
%arrayidx21 = getelementptr inbounds float, float* %pCoeffs.0, i32 20
%12 = bitcast float* %arrayidx21 to <4 x float>*
%arrayidx24 = getelementptr inbounds float, float* %pCoeffs.0, i32 24
%13 = bitcast float* %arrayidx24 to <4 x float>*
%arrayidx27 = getelementptr inbounds float, float* %pCoeffs.0, i32 28
%14 = bitcast float* %arrayidx27 to <4 x float>*
br label %while.body
while.body: ; preds = %while.body.lr.ph, %while.body
%sample.0208 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
%pIn.1207 = phi float* [ %pIn.0, %while.body.lr.ph ], [ %incdec.ptr8, %while.body ]
%pOut.1206 = phi float* [ %pDst, %while.body.lr.ph ], [ %add.ptr, %while.body ]
%Yn2.0205 = phi float [ %6, %while.body.lr.ph ], [ %37, %while.body ]
%Yn1.0204 = phi float [ %5, %while.body.lr.ph ], [ %36, %while.body ]
%Xn2.0203 = phi float [ %4, %while.body.lr.ph ], [ %17, %while.body ]
%Xn1.0202 = phi float [ %3, %while.body.lr.ph ], [ %18, %while.body ]
%incdec.ptr = getelementptr inbounds float, float* %pIn.1207, i32 1
%15 = load float, float* %pIn.1207, align 4
%incdec.ptr6 = getelementptr inbounds float, float* %pIn.1207, i32 2
%16 = load float, float* %incdec.ptr, align 4
%incdec.ptr7 = getelementptr inbounds float, float* %pIn.1207, i32 3
%17 = load float, float* %incdec.ptr6, align 4
%incdec.ptr8 = getelementptr inbounds float, float* %pIn.1207, i32 4
%18 = load float, float* %incdec.ptr7, align 4
%19 = load <4 x float>, <4 x float>* %7, align 4
%.splatinsert = insertelement <4 x float> undef, float %18, i32 0
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
%20 = fmul fast <4 x float> %.splat, %19
%21 = load <4 x float>, <4 x float>* %8, align 4
%.splatinsert10 = insertelement <4 x float> undef, float %17, i32 0
%.splat11 = shufflevector <4 x float> %.splatinsert10, <4 x float> undef, <4 x i32> zeroinitializer
%22 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %21, <4 x float> %.splat11, <4 x float> %20)
%23 = load <4 x float>, <4 x float>* %9, align 4
%.splatinsert13 = insertelement <4 x float> undef, float %16, i32 0
%.splat14 = shufflevector <4 x float> %.splatinsert13, <4 x float> undef, <4 x i32> zeroinitializer
%24 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %23, <4 x float> %.splat14, <4 x float> %22)
%25 = load <4 x float>, <4 x float>* %10, align 4
%.splatinsert16 = insertelement <4 x float> undef, float %15, i32 0
%.splat17 = shufflevector <4 x float> %.splatinsert16, <4 x float> undef, <4 x i32> zeroinitializer
%26 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %25, <4 x float> %.splat17, <4 x float> %24)
%27 = load <4 x float>, <4 x float>* %11, align 4
%.splatinsert19 = insertelement <4 x float> undef, float %Xn1.0202, i32 0
%.splat20 = shufflevector <4 x float> %.splatinsert19, <4 x float> undef, <4 x i32> zeroinitializer
%28 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %27, <4 x float> %.splat20, <4 x float> %26)
%29 = load <4 x float>, <4 x float>* %12, align 4
%.splatinsert22 = insertelement <4 x float> undef, float %Xn2.0203, i32 0
%.splat23 = shufflevector <4 x float> %.splatinsert22, <4 x float> undef, <4 x i32> zeroinitializer
%30 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %29, <4 x float> %.splat23, <4 x float> %28)
%31 = load <4 x float>, <4 x float>* %13, align 4
%.splatinsert25 = insertelement <4 x float> undef, float %Yn1.0204, i32 0
%.splat26 = shufflevector <4 x float> %.splatinsert25, <4 x float> undef, <4 x i32> zeroinitializer
%32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %31, <4 x float> %.splat26, <4 x float> %30)
%33 = load <4 x float>, <4 x float>* %14, align 4
%.splatinsert28 = insertelement <4 x float> undef, float %Yn2.0205, i32 0
%.splat29 = shufflevector <4 x float> %.splatinsert28, <4 x float> undef, <4 x i32> zeroinitializer
%34 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %33, <4 x float> %.splat29, <4 x float> %32)
%35 = bitcast float* %pOut.1206 to <4 x float>*
store <4 x float> %34, <4 x float>* %35, align 4
%add.ptr = getelementptr inbounds float, float* %pOut.1206, i32 4
%36 = extractelement <4 x float> %34, i32 3
%37 = extractelement <4 x float> %34, i32 2
%dec = add nsw i32 %sample.0208, -1
%cmp = icmp eq i32 %dec, 0
br i1 %cmp, label %while.end, label %while.body
while.end: ; preds = %while.body, %do.body
%Xn1.0.lcssa = phi float [ %3, %do.body ], [ %18, %while.body ]
%Xn2.0.lcssa = phi float [ %4, %do.body ], [ %17, %while.body ]
%Yn1.0.lcssa = phi float [ %5, %do.body ], [ %36, %while.body ]
%Yn2.0.lcssa = phi float [ %6, %do.body ], [ %37, %while.body ]
%pOut.1.lcssa = phi float* [ %pDst, %do.body ], [ %add.ptr, %while.body ]
%pIn.1.lcssa = phi float* [ %pIn.0, %do.body ], [ %incdec.ptr8, %while.body ]
%X3.1.lcssa = phi float [ %X3.0, %do.body ], [ %18, %while.body ]
br i1 %tobool, label %if.end69, label %if.then
if.then: ; preds = %while.end
%incdec.ptr30 = getelementptr inbounds float, float* %pIn.1.lcssa, i32 1
%38 = load float, float* %pIn.1.lcssa, align 4
%incdec.ptr31 = getelementptr inbounds float, float* %pIn.1.lcssa, i32 2
%39 = load float, float* %incdec.ptr30, align 4
%incdec.ptr32 = getelementptr inbounds float, float* %pIn.1.lcssa, i32 3
%40 = load float, float* %incdec.ptr31, align 4
%41 = load float, float* %incdec.ptr32, align 4
%42 = bitcast float* %pCoeffs.0 to <4 x float>*
%43 = load <4 x float>, <4 x float>* %42, align 4
%.splatinsert34 = insertelement <4 x float> undef, float %41, i32 0
%.splat35 = shufflevector <4 x float> %.splatinsert34, <4 x float> undef, <4 x i32> zeroinitializer
%44 = fmul fast <4 x float> %.splat35, %43
%arrayidx36 = getelementptr inbounds float, float* %pCoeffs.0, i32 4
%45 = bitcast float* %arrayidx36 to <4 x float>*
%46 = load <4 x float>, <4 x float>* %45, align 4
%.splatinsert37 = insertelement <4 x float> undef, float %40, i32 0
%.splat38 = shufflevector <4 x float> %.splatinsert37, <4 x float> undef, <4 x i32> zeroinitializer
%47 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %46, <4 x float> %.splat38, <4 x float> %44)
%arrayidx39 = getelementptr inbounds float, float* %pCoeffs.0, i32 8
%48 = bitcast float* %arrayidx39 to <4 x float>*
%49 = load <4 x float>, <4 x float>* %48, align 4
%.splatinsert40 = insertelement <4 x float> undef, float %39, i32 0
%.splat41 = shufflevector <4 x float> %.splatinsert40, <4 x float> undef, <4 x i32> zeroinitializer
%50 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %49, <4 x float> %.splat41, <4 x float> %47)
%arrayidx42 = getelementptr inbounds float, float* %pCoeffs.0, i32 12
%51 = bitcast float* %arrayidx42 to <4 x float>*
%52 = load <4 x float>, <4 x float>* %51, align 4
%.splatinsert43 = insertelement <4 x float> undef, float %38, i32 0
%.splat44 = shufflevector <4 x float> %.splatinsert43, <4 x float> undef, <4 x i32> zeroinitializer
%53 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %52, <4 x float> %.splat44, <4 x float> %50)
%arrayidx45 = getelementptr inbounds float, float* %pCoeffs.0, i32 16
%54 = bitcast float* %arrayidx45 to <4 x float>*
%55 = load <4 x float>, <4 x float>* %54, align 4
%.splatinsert46 = insertelement <4 x float> undef, float %Xn1.0.lcssa, i32 0
%.splat47 = shufflevector <4 x float> %.splatinsert46, <4 x float> undef, <4 x i32> zeroinitializer
%56 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %55, <4 x float> %.splat47, <4 x float> %53)
%arrayidx48 = getelementptr inbounds float, float* %pCoeffs.0, i32 20
%57 = bitcast float* %arrayidx48 to <4 x float>*
%58 = load <4 x float>, <4 x float>* %57, align 4
%.splatinsert49 = insertelement <4 x float> undef, float %Xn2.0.lcssa, i32 0
%.splat50 = shufflevector <4 x float> %.splatinsert49, <4 x float> undef, <4 x i32> zeroinitializer
%59 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %58, <4 x float> %.splat50, <4 x float> %56)
%arrayidx51 = getelementptr inbounds float, float* %pCoeffs.0, i32 24
%60 = bitcast float* %arrayidx51 to <4 x float>*
%61 = load <4 x float>, <4 x float>* %60, align 4
%.splatinsert52 = insertelement <4 x float> undef, float %Yn1.0.lcssa, i32 0
%.splat53 = shufflevector <4 x float> %.splatinsert52, <4 x float> undef, <4 x i32> zeroinitializer
%62 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %61, <4 x float> %.splat53, <4 x float> %59)
%arrayidx54 = getelementptr inbounds float, float* %pCoeffs.0, i32 28
%63 = bitcast float* %arrayidx54 to <4 x float>*
%64 = load <4 x float>, <4 x float>* %63, align 4
%.splatinsert55 = insertelement <4 x float> undef, float %Yn2.0.lcssa, i32 0
%.splat56 = shufflevector <4 x float> %.splatinsert55, <4 x float> undef, <4 x i32> zeroinitializer
%65 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %64, <4 x float> %.splat56, <4 x float> %62)
%66 = extractelement <4 x float> %65, i32 0
br i1 %cmp57, label %if.then58, label %if.else
if.then58: ; preds = %if.then
store float %66, float* %pOut.1.lcssa, align 4
br label %if.end69
if.else: ; preds = %if.then
%incdec.ptr62 = getelementptr inbounds float, float* %pOut.1.lcssa, i32 1
store float %66, float* %pOut.1.lcssa, align 4
%67 = extractelement <4 x float> %65, i32 1
store float %67, float* %incdec.ptr62, align 4
br i1 %cmp60, label %if.end69, label %if.else64
if.else64: ; preds = %if.else
%incdec.ptr63 = getelementptr inbounds float, float* %pOut.1.lcssa, i32 2
%68 = extractelement <4 x float> %65, i32 2
store float %68, float* %incdec.ptr63, align 4
br label %if.end69
if.end69: ; preds = %if.else, %while.end, %if.then58, %if.else64
%Xn1.1 = phi float [ %38, %if.then58 ], [ %40, %if.else64 ], [ %Xn1.0.lcssa, %while.end ], [ %39, %if.else ]
%Xn2.1 = phi float [ %X3.1.lcssa, %if.then58 ], [ %39, %if.else64 ], [ %Xn2.0.lcssa, %while.end ], [ %38, %if.else ]
%Yn1.1 = phi float [ %66, %if.then58 ], [ %68, %if.else64 ], [ %Yn1.0.lcssa, %while.end ], [ %67, %if.else ]
%Yn2.1 = phi float [ %Yn1.0.lcssa, %if.then58 ], [ %67, %if.else64 ], [ %Yn2.0.lcssa, %while.end ], [ %66, %if.else ]
%X3.2 = phi float [ %41, %if.then58 ], [ %41, %if.else64 ], [ %X3.1.lcssa, %while.end ], [ %41, %if.else ]
store float %Xn1.1, float* %pState.0, align 4
store float %Xn2.1, float* %arrayidx3, align 4
store float %Yn1.1, float* %arrayidx4, align 4
%incdec.ptr73 = getelementptr inbounds float, float* %pState.0, i32 4
store float %Yn2.1, float* %arrayidx5, align 4
%add.ptr74 = getelementptr inbounds float, float* %pCoeffs.0, i32 32
%dec75 = add i32 %stage.0, -1
%cmp76 = icmp eq i32 %dec75, 0
br i1 %cmp76, label %do.end, label %do.body
do.end: ; preds = %if.end69
ret void
}
%struct.arm_biquad_cascade_df2T_instance_f32 = type { i8, float*, float* }
define void @arm_biquad_cascade_df2T_f32(%struct.arm_biquad_cascade_df2T_instance_f32* nocapture readonly %S, float* nocapture readonly %pSrc, float* nocapture %pDst, i32 %blockSize) {
; CHECK-LABEL: arm_biquad_cascade_df2T_f32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: ldrd r12, r6, [r0, #4]
; CHECK-NEXT: and r8, r3, #1
; CHECK-NEXT: ldrb r0, [r0]
; CHECK-NEXT: lsrs r3, r3, #1
; CHECK-NEXT: vldr s0, .LCPI20_0
; CHECK-NEXT: b .LBB20_3
; CHECK-NEXT: .LBB20_1: @ %if.else
; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
; CHECK-NEXT: vstr s4, [r12]
; CHECK-NEXT: vmov.f32 s6, s5
; CHECK-NEXT: .LBB20_2: @ %if.end
; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
; CHECK-NEXT: vstr s6, [r12, #4]
; CHECK-NEXT: adds r6, #20
; CHECK-NEXT: subs r0, #1
; CHECK-NEXT: add.w r12, r12, #8
; CHECK-NEXT: mov r1, r2
; CHECK-NEXT: beq .LBB20_8
; CHECK-NEXT: .LBB20_3: @ %do.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB20_5 Depth 2
; CHECK-NEXT: vldrw.u32 q3, [r6]
; CHECK-NEXT: movs r5, #0
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vshlc q4, r5, #32
; CHECK-NEXT: vldrw.u32 q2, [r6, #8]
; CHECK-NEXT: vmov q5, q2
; CHECK-NEXT: vshlc q5, r5, #32
; CHECK-NEXT: vldrw.u32 q1, [r12]
; CHECK-NEXT: vmov.f32 s6, s0
; CHECK-NEXT: mov r5, r2
; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: wls lr, r3, .LBB20_6
; CHECK-NEXT: @ %bb.4: @ %while.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
; CHECK-NEXT: vmov q6, q1
; CHECK-NEXT: mov r5, r2
; CHECK-NEXT: .LBB20_5: @ %while.body
; CHECK-NEXT: @ Parent Loop BB20_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: ldrd r7, r4, [r1], #8
; CHECK-NEXT: vfma.f32 q6, q3, r7
; CHECK-NEXT: vmov r7, s24
; CHECK-NEXT: vmov q1, q6
; CHECK-NEXT: vfma.f32 q1, q2, r7
; CHECK-NEXT: vstr s24, [r5]
; CHECK-NEXT: vmov.f32 s7, s0
; CHECK-NEXT: vfma.f32 q1, q4, r4
; CHECK-NEXT: vmov r4, s5
; CHECK-NEXT: vstr s5, [r5, #4]
; CHECK-NEXT: vfma.f32 q1, q5, r4
; CHECK-NEXT: adds r5, #8
; CHECK-NEXT: vmov.f32 s4, s6
; CHECK-NEXT: vmov.f32 s5, s7
; CHECK-NEXT: vmov.f32 s6, s0
; CHECK-NEXT: vmov q6, q1
; CHECK-NEXT: le lr, .LBB20_5
; CHECK-NEXT: .LBB20_6: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
; CHECK-NEXT: cmp.w r8, #0
; CHECK-NEXT: beq .LBB20_1
; CHECK-NEXT: @ %bb.7: @ %if.then
; CHECK-NEXT: @ in Loop: Header=BB20_3 Depth=1
; CHECK-NEXT: ldr r1, [r1]
; CHECK-NEXT: vfma.f32 q1, q3, r1
; CHECK-NEXT: vmov r1, s4
; CHECK-NEXT: vstr s4, [r5]
; CHECK-NEXT: vfma.f32 q1, q2, r1
; CHECK-NEXT: vstr s5, [r12]
; CHECK-NEXT: b .LBB20_2
; CHECK-NEXT: .LBB20_8: @ %do.end
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.9:
; CHECK-NEXT: .LCPI20_0:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
%pState1 = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f32, %struct.arm_biquad_cascade_df2T_instance_f32* %S, i32 0, i32 1
%0 = load float*, float** %pState1, align 4
%numStages = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f32, %struct.arm_biquad_cascade_df2T_instance_f32* %S, i32 0, i32 0
%1 = load i8, i8* %numStages, align 4
%conv = zext i8 %1 to i32
%pCoeffs = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f32, %struct.arm_biquad_cascade_df2T_instance_f32* %S, i32 0, i32 2
%2 = load float*, float** %pCoeffs, align 4
%div = lshr i32 %blockSize, 1
%cmp.not90 = icmp eq i32 %div, 0
%and = and i32 %blockSize, 1
%tobool.not = icmp eq i32 %and, 0
br label %do.body
do.body: ; preds = %if.end, %entry
%stage.0 = phi i32 [ %conv, %entry ], [ %dec23, %if.end ]
%pCurCoeffs.0 = phi float* [ %2, %entry ], [ %add.ptr2, %if.end ]
%pState.0 = phi float* [ %0, %entry ], [ %pState.1, %if.end ]
%pIn.0 = phi float* [ %pSrc, %entry ], [ %pDst, %if.end ]
%3 = bitcast float* %pCurCoeffs.0 to <4 x float>*
%4 = load <4 x float>, <4 x float>* %3, align 4
%add.ptr = getelementptr inbounds float, float* %pCurCoeffs.0, i32 2
%5 = bitcast float* %add.ptr to <4 x float>*
%6 = load <4 x float>, <4 x float>* %5, align 4
%add.ptr2 = getelementptr inbounds float, float* %pCurCoeffs.0, i32 5
%7 = bitcast float* %pState.0 to <4 x float>*
%8 = load <4 x float>, <4 x float>* %7, align 8
%9 = shufflevector <4 x float> %8, <4 x float> <float poison, float poison, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
%10 = bitcast <4 x float> %4 to <4 x i32>
%11 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %10, i32 0, i32 32)
%12 = extractvalue { i32, <4 x i32> } %11, 0
%13 = extractvalue { i32, <4 x i32> } %11, 1
%14 = bitcast <4 x i32> %13 to <4 x float>
%15 = bitcast <4 x float> %6 to <4 x i32>
%16 = tail call { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32> %15, i32 %12, i32 32)
%17 = extractvalue { i32, <4 x i32> } %16, 1
%18 = bitcast <4 x i32> %17 to <4 x float>
br i1 %cmp.not90, label %while.end, label %while.body
while.body: ; preds = %do.body, %while.body
%pIn.194 = phi float* [ %incdec.ptr4, %while.body ], [ %pIn.0, %do.body ]
%state.093 = phi <4 x float> [ %30, %while.body ], [ %9, %do.body ]
%pOut.192 = phi float* [ %incdec.ptr12, %while.body ], [ %pDst, %do.body ]
%sample.091 = phi i32 [ %dec, %while.body ], [ %div, %do.body ]
%incdec.ptr = getelementptr inbounds float, float* %pIn.194, i32 1
%19 = load float, float* %pIn.194, align 4
%incdec.ptr4 = getelementptr inbounds float, float* %pIn.194, i32 2
%20 = load float, float* %incdec.ptr, align 4
%.splatinsert = insertelement <4 x float> poison, float %19, i32 0
%.splat = shufflevector <4 x float> %.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
%21 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %4, <4 x float> %.splat, <4 x float> %state.093)
%22 = extractelement <4 x float> %21, i32 0
%.splat6 = shufflevector <4 x float> %21, <4 x float> poison, <4 x i32> zeroinitializer
%23 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %6, <4 x float> %.splat6, <4 x float> %21)
%24 = insertelement <4 x float> %23, float 0.000000e+00, i32 3
%.splatinsert7 = insertelement <4 x float> poison, float %20, i32 0
%.splat8 = shufflevector <4 x float> %.splatinsert7, <4 x float> poison, <4 x i32> zeroinitializer
%25 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %14, <4 x float> %.splat8, <4 x float> %24)
%26 = extractelement <4 x float> %25, i32 1
%.splat10 = shufflevector <4 x float> %25, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
%27 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %18, <4 x float> %.splat10, <4 x float> %25)
%28 = shufflevector <4 x float> %27, <4 x float> undef, <4 x i32> <i32 2, i32 undef, i32 undef, i32 3>
%29 = insertelement <4 x float> %28, float 0.000000e+00, i32 2
%30 = shufflevector <4 x float> %29, <4 x float> %27, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
%incdec.ptr11 = getelementptr inbounds float, float* %pOut.192, i32 1
store float %22, float* %pOut.192, align 4
%incdec.ptr12 = getelementptr inbounds float, float* %pOut.192, i32 2
store float %26, float* %incdec.ptr11, align 4
%dec = add nsw i32 %sample.091, -1
%cmp.not = icmp eq i32 %dec, 0
br i1 %cmp.not, label %while.end, label %while.body
while.end: ; preds = %while.body, %do.body
%pOut.1.lcssa = phi float* [ %pDst, %do.body ], [ %incdec.ptr12, %while.body ]
%state.0.lcssa = phi <4 x float> [ %9, %do.body ], [ %30, %while.body ]
%pIn.1.lcssa = phi float* [ %pIn.0, %do.body ], [ %incdec.ptr4, %while.body ]
br i1 %tobool.not, label %if.else, label %if.then
if.then: ; preds = %while.end
%31 = load float, float* %pIn.1.lcssa, align 4
%.splatinsert14 = insertelement <4 x float> poison, float %31, i32 0
%.splat15 = shufflevector <4 x float> %.splatinsert14, <4 x float> poison, <4 x i32> zeroinitializer
%32 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %4, <4 x float> %.splat15, <4 x float> %state.0.lcssa)
%33 = extractelement <4 x float> %32, i32 0
%.splat17 = shufflevector <4 x float> %32, <4 x float> poison, <4 x i32> zeroinitializer
%34 = tail call fast <4 x float> @llvm.fma.v4f32(<4 x float> %6, <4 x float> %.splat17, <4 x float> %32)
store float %33, float* %pOut.1.lcssa, align 4
%35 = extractelement <4 x float> %34, i32 1
store float %35, float* %pState.0, align 4
%36 = extractelement <4 x float> %34, i32 2
br label %if.end
if.else: ; preds = %while.end
%37 = extractelement <4 x float> %state.0.lcssa, i32 0
store float %37, float* %pState.0, align 4
%38 = extractelement <4 x float> %state.0.lcssa, i32 1
br label %if.end
if.end: ; preds = %if.else, %if.then
%.sink = phi float [ %38, %if.else ], [ %36, %if.then ]
%39 = getelementptr inbounds float, float* %pState.0, i32 1
store float %.sink, float* %39, align 4
%pState.1 = getelementptr inbounds float, float* %pState.0, i32 2
%dec23 = add i32 %stage.0, -1
%cmp24.not = icmp eq i32 %dec23, 0
br i1 %cmp24.not, label %do.end, label %do.body
do.end: ; preds = %if.end
ret void
}
define arm_aapcs_vfpcc float @vecAddAcrossF32Mve(<4 x float> %in) {
; CHECK-LABEL: vecAddAcrossF32Mve:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vadd.f32 s4, s0, s1
; CHECK-NEXT: vadd.f32 s4, s4, s2
; CHECK-NEXT: vadd.f32 s0, s4, s3
; CHECK-NEXT: bx lr
entry:
%0 = extractelement <4 x float> %in, i32 0
%1 = extractelement <4 x float> %in, i32 1
%add = fadd fast float %0, %1
%2 = extractelement <4 x float> %in, i32 2
%add1 = fadd fast float %add, %2
%3 = extractelement <4 x float> %in, i32 3
%add2 = fadd fast float %add1, %3
ret float %add2
}
declare { i32, <4 x i32> } @llvm.arm.mve.vshlc.v4i32(<4 x i32>, i32, i32) #1
declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)
declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture)
declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32, i32, i32)
declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32*, <4 x i32>, i32, i32, i32)
2020-03-20 09:25:19 +01:00
declare void @llvm.assume(i1)
declare <4 x i1> @llvm.arm.mve.vctp32(i32)
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>)