; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -verify-machineinstrs %s -o - | FileCheck %s

define arm_aapcs_vfpcc void @test_fadd(half* noalias nocapture readonly %A, half %B, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fadd:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB0_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r3, s0
; CHECK-NEXT:  .LBB0_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vadd.f16 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB0_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = fadd fast <8 x half> %wide.load, %broadcast.splat11
  %4 = getelementptr inbounds half, half* %C, i32 %index
  %5 = bitcast half* %4 to <8 x half>*
  store <8 x half> %3, <8 x half>* %5, align 4
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, %n
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fadd_r(half* noalias nocapture readonly %A, half %B, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fadd_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB1_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r3, s0
; CHECK-NEXT:  .LBB1_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vadd.f16 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB1_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = fadd fast <8 x half> %broadcast.splat11, %wide.load
  %4 = getelementptr inbounds half, half* %C, i32 %index
  %5 = bitcast half* %4 to <8 x half>*
  store <8 x half> %3, <8 x half>* %5, align 4
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, %n
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fmul(half* noalias nocapture readonly %A, half %B, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fmul:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB2_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r3, s0
; CHECK-NEXT:  .LBB2_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vmul.f16 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB2_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = fmul fast <8 x half> %wide.load, %broadcast.splat11
  %4 = getelementptr inbounds half, half* %C, i32 %index
  %5 = bitcast half* %4 to <8 x half>*
  store <8 x half> %3, <8 x half>* %5, align 4
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, %n
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fmul_r(half* noalias nocapture readonly %A, half %B, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fmul_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB3_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r3, s0
; CHECK-NEXT:  .LBB3_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vmul.f16 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB3_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = fmul fast <8 x half> %broadcast.splat11, %wide.load
  %4 = getelementptr inbounds half, half* %C, i32 %index
  %5 = bitcast half* %4 to <8 x half>*
  store <8 x half> %3, <8 x half>* %5, align 4
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, %n
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fsub(half* noalias nocapture readonly %A, half %B, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fsub:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB4_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r3, s0
; CHECK-NEXT:  .LBB4_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vsub.f16 q0, q0, r3
; CHECK-NEXT:    vstrb.8 q0, [r1], #16
; CHECK-NEXT:    bne .LBB4_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = fsub fast <8 x half> %wide.load, %broadcast.splat11
  %4 = getelementptr inbounds half, half* %C, i32 %index
  %5 = bitcast half* %4 to <8 x half>*
  store <8 x half> %3, <8 x half>* %5, align 4
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, %n
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fsub_r(half* noalias nocapture readonly %A, half %B, half* noalias nocapture %C, i32 %n) {
; CHECK-LABEL: test_fsub_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r2, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB5_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r3, s0
; CHECK-NEXT:    vdup.16 q0, r3
; CHECK-NEXT:  .LBB5_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    subs r2, #8
; CHECK-NEXT:    vsub.f16 q1, q0, q1
; CHECK-NEXT:    vstrb.8 q1, [r1], #16
; CHECK-NEXT:    bne .LBB5_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp18 = icmp sgt i32 %n, 0
  br i1 %cmp18, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert10 = insertelement <8 x half> undef, half %B, i32 0
  %broadcast.splat11 = shufflevector <8 x half> %broadcast.splatinsert10, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = fsub fast <8 x half> %broadcast.splat11, %wide.load
  %4 = getelementptr inbounds half, half* %C, i32 %index
  %5 = bitcast half* %4 to <8 x half>*
  store <8 x half> %3, <8 x half>* %5, align 4
  %index.next = add i32 %index, 8
  %6 = icmp eq i32 %index.next, %n
  br i1 %6, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fmas(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmas:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB6_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:  .LBB6_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vfmas.f16 q1, q0, r12
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB6_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = getelementptr inbounds half, half* %B, i32 %index
  %4 = bitcast half* %3 to <8 x half>*
  %wide.load12 = load <8 x half>, <8 x half>* %4, align 4
  %5 = fmul fast <8 x half> %wide.load12, %wide.load
  %6 = fadd fast <8 x half> %5, %broadcast.splat14
  %7 = getelementptr inbounds half, half* %D, i32 %index
  %8 = bitcast half* %7 to <8 x half>*
  store <8 x half> %6, <8 x half>* %8, align 4
  %index.next = add i32 %index, 8
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fmas_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmas_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB7_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:  .LBB7_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vfmas.f16 q1, q0, r12
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB7_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = getelementptr inbounds half, half* %B, i32 %index
  %4 = bitcast half* %3 to <8 x half>*
  %wide.load12 = load <8 x half>, <8 x half>* %4, align 4
  %5 = fmul fast <8 x half> %wide.load12, %wide.load
  %6 = fadd fast <8 x half> %broadcast.splat14, %5
  %7 = getelementptr inbounds half, half* %D, i32 %index
  %8 = bitcast half* %7 to <8 x half>*
  store <8 x half> %6, <8 x half>* %8, align 4
  %index.next = add i32 %index, 8
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fma(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fma:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB8_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:  .LBB8_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vfma.f16 q1, q0, r12
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB8_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = fmul fast <8 x half> %wide.load, %broadcast.splat13
  %4 = getelementptr inbounds half, half* %B, i32 %index
  %5 = bitcast half* %4 to <8 x half>*
  %wide.load14 = load <8 x half>, <8 x half>* %5, align 4
  %6 = fadd fast <8 x half> %3, %wide.load14
  %7 = getelementptr inbounds half, half* %D, i32 %index
  %8 = bitcast half* %7 to <8 x half>*
  store <8 x half> %6, <8 x half>* %8, align 4
  %index.next = add i32 %index, 8
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fma_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fma_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB9_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:  .LBB9_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r0], #16
; CHECK-NEXT:    vldrw.u32 q1, [r1], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vfma.f16 q1, q0, r12
; CHECK-NEXT:    vstrb.8 q1, [r2], #16
; CHECK-NEXT:    bne .LBB9_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = fmul fast <8 x half> %broadcast.splat13, %wide.load
  %4 = getelementptr inbounds half, half* %B, i32 %index
  %5 = bitcast half* %4 to <8 x half>*
  %wide.load14 = load <8 x half>, <8 x half>* %5, align 4
  %6 = fadd fast <8 x half> %3, %wide.load14
  %7 = getelementptr inbounds half, half* %D, i32 %index
  %8 = bitcast half* %7 to <8 x half>*
  store <8 x half> %6, <8 x half>* %8, align 4
  %index.next = add i32 %index, 8
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fmss(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmss:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB10_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:    vdup.16 q0, r12
; CHECK-NEXT:    vneg.f16 q0, q0
; CHECK-NEXT:  .LBB10_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vfma.f16 q3, q2, q1
; CHECK-NEXT:    vstrb.8 q3, [r2], #16
; CHECK-NEXT:    bne .LBB10_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = getelementptr inbounds half, half* %B, i32 %index
  %4 = bitcast half* %3 to <8 x half>*
  %wide.load12 = load <8 x half>, <8 x half>* %4, align 4
  %5 = fmul fast <8 x half> %wide.load12, %wide.load
  %6 = fsub fast <8 x half> %5, %broadcast.splat14
  %7 = getelementptr inbounds half, half* %D, i32 %index
  %8 = bitcast half* %7 to <8 x half>*
  store <8 x half> %6, <8 x half>* %8, align 4
  %index.next = add i32 %index, 8
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fmss_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fmss_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB11_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:    vdup.16 q0, r12
; CHECK-NEXT:  .LBB11_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    vldrw.u32 q2, [r1], #16
; CHECK-NEXT:    vmov q3, q0
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vfms.f16 q3, q2, q1
; CHECK-NEXT:    vstrb.8 q3, [r2], #16
; CHECK-NEXT:    bne .LBB11_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert13 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat14 = shufflevector <8 x half> %broadcast.splatinsert13, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = getelementptr inbounds half, half* %B, i32 %index
  %4 = bitcast half* %3 to <8 x half>*
  %wide.load12 = load <8 x half>, <8 x half>* %4, align 4
  %5 = fmul fast <8 x half> %wide.load12, %wide.load
  %6 = fsub fast <8 x half> %broadcast.splat14, %5
  %7 = getelementptr inbounds half, half* %D, i32 %index
  %8 = bitcast half* %7 to <8 x half>*
  store <8 x half> %6, <8 x half>* %8, align 4
  %index.next = add i32 %index, 8
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fms(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fms:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB12_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:  .LBB12_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vneg.f16 q0, q0
; CHECK-NEXT:    vfma.f16 q0, q1, r12
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    bne .LBB12_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = fmul fast <8 x half> %wide.load, %broadcast.splat13
  %4 = getelementptr inbounds half, half* %B, i32 %index
  %5 = bitcast half* %4 to <8 x half>*
  %wide.load14 = load <8 x half>, <8 x half>* %5, align 4
  %6 = fsub fast <8 x half> %3, %wide.load14
  %7 = getelementptr inbounds half, half* %D, i32 %index
  %8 = bitcast half* %7 to <8 x half>*
  store <8 x half> %6, <8 x half>* %8, align 4
  %index.next = add i32 %index, 8
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define arm_aapcs_vfpcc void @test_fms_r(half* noalias nocapture readonly %A, half* noalias nocapture readonly %B, half %C, half* noalias nocapture %D, i32 %n) {
; CHECK-LABEL: test_fms_r:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    cmp r3, #1
; CHECK-NEXT:    it lt
; CHECK-NEXT:    bxlt lr
; CHECK-NEXT:  .LBB13_1: @ %vector.ph
; CHECK-NEXT:    vmov.f16 r12, s0
; CHECK-NEXT:  .LBB13_2: @ %vector.body
; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
; CHECK-NEXT:    vldrw.u32 q1, [r0], #16
; CHECK-NEXT:    subs r3, #8
; CHECK-NEXT:    vneg.f16 q0, q0
; CHECK-NEXT:    vfma.f16 q0, q1, r12
; CHECK-NEXT:    vstrb.8 q0, [r2], #16
; CHECK-NEXT:    bne .LBB13_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT:    bx lr
entry:
  %0 = and i32 %n, 7
  %cmp = icmp eq i32 %0, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp110 = icmp sgt i32 %n, 0
  br i1 %cmp110, label %vector.ph, label %for.cond.cleanup

vector.ph:                                        ; preds = %entry
  %broadcast.splatinsert12 = insertelement <8 x half> undef, half %C, i32 0
  %broadcast.splat13 = shufflevector <8 x half> %broadcast.splatinsert12, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %1 = getelementptr inbounds half, half* %A, i32 %index
  %2 = bitcast half* %1 to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %2, align 4
  %3 = fmul fast <8 x half> %broadcast.splat13, %wide.load
  %4 = getelementptr inbounds half, half* %B, i32 %index
  %5 = bitcast half* %4 to <8 x half>*
  %wide.load14 = load <8 x half>, <8 x half>* %5, align 4
  %6 = fsub fast <8 x half> %3, %wide.load14
  %7 = getelementptr inbounds half, half* %D, i32 %index
  %8 = bitcast half* %7 to <8 x half>*
  store <8 x half> %6, <8 x half>* %8, align 4
  %index.next = add i32 %index, 8
  %9 = icmp eq i32 %index.next, %n
  br i1 %9, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body, %entry
  ret void
}

define dso_local void @test_nested(half* noalias nocapture %pInT1, half* noalias nocapture readonly %pOutT1, half* noalias nocapture readonly %pPRT_in, half* noalias nocapture readnone %pPRT_pDst, i32 %numRows, i32 %numCols, i32 %l) local_unnamed_addr #0 {
; CHECK-LABEL: test_nested:
; CHECK:       @ %bb.0: @ %for.body.us.preheader
; CHECK-NEXT:    .save {r4, r5, r6, lr}
; CHECK-NEXT:    push {r4, r5, r6, lr}
; CHECK-NEXT:    ldrd lr, r12, [sp, #16]
; CHECK-NEXT:    lsl.w r3, r12, #1
; CHECK-NEXT:  .LBB14_1: @ %for.body.us
; CHECK-NEXT:    @ =>This Loop Header: Depth=1
; CHECK-NEXT:    @ Child Loop BB14_2 Depth 2
; CHECK-NEXT:    ldrh r4, [r1]
; CHECK-NEXT:    mov r5, r2
; CHECK-NEXT:    mov r6, r12
; CHECK-NEXT:    vdup.16 q0, r4
; CHECK-NEXT:    mov r4, r0
; CHECK-NEXT:  .LBB14_2: @ %vector.body
; CHECK-NEXT:    @ Parent Loop BB14_1 Depth=1
; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
; CHECK-NEXT:    vldrw.u32 q1, [r5], #16
; CHECK-NEXT:    vldrw.u32 q2, [r4]
; CHECK-NEXT:    subs r6, #8
; CHECK-NEXT:    vfms.f16 q2, q1, q0
; CHECK-NEXT:    vstrb.8 q2, [r4], #16
; CHECK-NEXT:    bne .LBB14_2
; CHECK-NEXT:  @ %bb.3: @ %for.cond6.for.end_crit_edge.us
; CHECK-NEXT:    @ in Loop: Header=BB14_1 Depth=1
; CHECK-NEXT:    add r0, r3
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    adds r1, #2
; CHECK-NEXT:    le lr, .LBB14_1
; CHECK-NEXT:  @ %bb.4: @ %for.end14
; CHECK-NEXT:    pop {r4, r5, r6, pc}
for.body.us.preheader:
  %cmp = icmp sgt i32 %numRows, 0
  tail call void @llvm.assume(i1 %cmp)
  %cmp1 = icmp sgt i32 %numCols, 0
  tail call void @llvm.assume(i1 %cmp1)
  %rem = and i32 %numCols, 7
  %cmp2 = icmp eq i32 %rem, 0
  tail call void @llvm.assume(i1 %cmp2)
  %cmp3 = icmp slt i32 %l, %numCols
  tail call void @llvm.assume(i1 %cmp3)
  br label %for.body.us

for.body.us:                                      ; preds = %for.cond6.for.end_crit_edge.us, %for.body.us.preheader
  %pInT1.addr.038.us = phi half* [ %scevgep40, %for.cond6.for.end_crit_edge.us ], [ %pInT1, %for.body.us.preheader ]
  %i.037.us = phi i32 [ %inc13.us, %for.cond6.for.end_crit_edge.us ], [ 0, %for.body.us.preheader ]
  %pOutT1.addr.036.us = phi half* [ %incdec.ptr.us, %for.cond6.for.end_crit_edge.us ], [ %pOutT1, %for.body.us.preheader ]
  %pPRT_in.addr.035.us = phi half* [ %scevgep, %for.cond6.for.end_crit_edge.us ], [ %pPRT_in, %for.body.us.preheader ]
  %scevgep = getelementptr half, half* %pPRT_in.addr.035.us, i32 %numCols
  %0 = load half, half* %pOutT1.addr.036.us, align 4
  %broadcast.splatinsert47 = insertelement <8 x half> undef, half %0, i32 0
  %broadcast.splat48 = shufflevector <8 x half> %broadcast.splatinsert47, <8 x half> undef, <8 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %for.body.us
  %index = phi i32 [ 0, %for.body.us ], [ %index.next, %vector.body ]
  %next.gep = getelementptr half, half* %pInT1.addr.038.us, i32 %index
  %next.gep45 = getelementptr half, half* %pPRT_in.addr.035.us, i32 %index
  %1 = bitcast half* %next.gep to <8 x half>*
  %wide.load = load <8 x half>, <8 x half>* %1, align 4
  %2 = bitcast half* %next.gep45 to <8 x half>*
  %wide.load46 = load <8 x half>, <8 x half>* %2, align 4
  %3 = fmul fast <8 x half> %wide.load46, %broadcast.splat48
  %4 = fsub fast <8 x half> %wide.load, %3
  store <8 x half> %4, <8 x half>* %1, align 4
  %index.next = add i32 %index, 8
  %5 = icmp eq i32 %index.next, %numCols
  br i1 %5, label %for.cond6.for.end_crit_edge.us, label %vector.body

for.cond6.for.end_crit_edge.us:                   ; preds = %vector.body
  %incdec.ptr.us = getelementptr inbounds half, half* %pOutT1.addr.036.us, i32 1
  %scevgep40 = getelementptr half, half* %pInT1.addr.038.us, i32 %numCols
  %inc13.us = add nuw nsw i32 %i.037.us, 1
  %exitcond41 = icmp eq i32 %inc13.us, %numRows
  br i1 %exitcond41, label %for.end14, label %for.body.us

for.end14:                                        ; preds = %for.cond6.for.end_crit_edge.us
  ret void
}

%struct.arm_fir_instance_f32 = type { i16, half*, half* }

define void @arm_fir_f32_1_4_mve(%struct.arm_fir_instance_f32* nocapture readonly %S, half* nocapture readonly %pSrc, half* %pDst, i32 %blockSize) {
|
|
|
|
; CHECK-LABEL: arm_fir_f32_1_4_mve:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
2020-03-25 12:35:53 +01:00
|
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
|
|
; CHECK-NEXT: .pad #16
|
|
|
|
; CHECK-NEXT: sub sp, #16
|
[ARM] Improve WLS lowering
Recently we improved the lowering of low overhead loops and tail
predicated loops, but concentrated first on the DLS do style loops. This
extends those improvements over to the WLS while loops, improving the
chance of lowering them successfully. To do this the lowering has to
change a little as the instructions are terminators that produce a value
- something that needs to be treated carefully.
Lowering starts at the Hardware Loop pass, inserting a new
llvm.test.start.loop.iterations that produces both an i1 to control the
loop entry and an i32 similar to the llvm.start.loop.iterations
intrinsic added for do loops. This feeds into the loop phi, properly
gluing the values together:
%wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div)
%wls0 = extractvalue { i32, i1 } %wls, 0
%wls1 = extractvalue { i32, i1 } %wls, 1
br i1 %wls1, label %loop.ph, label %loop.exit
...
loop:
%lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ]
..
%iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1)
%cmp = icmp ne i32 %iv.next, 0
br i1 %cmp, label %loop, label %loop.exit
The llvm.test.start.loop.iterations need to be lowered through ISel
lowering as a pair of WLS and WLSSETUP nodes, which each get converted
to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent
t2WhileLoopStart from being a terminator that produces a value,
something difficult to control at that stage in the pipeline. Instead
the t2WhileLoopSetup produces the value of LR (essentially acting as a
lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc).
These are then converted into a single t2WhileLoopStartLR at the same
point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop
to prevent them from progressing further in the pipeline. The
t2WhileLoopStartLR is a single instruction that takes a GPR and produces
LR, similar to the WLS instruction.
%1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3
t2B %bb.1
...
bb.2.loop:
%2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2
...
%3:gprlr = t2LoopEndDec %2:gprlr, %bb.2
t2B %bb.3
The t2WhileLoopStartLR can then be treated similar to the other low
overhead loop pseudos, eventually being lowered to a WLS providing the
branches are within range.
Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: ldrh.w r9, [r0]
|
|
|
|
; CHECK-NEXT: ldr.w r10, [r0, #4]
|
|
|
|
; CHECK-NEXT: sub.w r6, r9, #1
|
2020-05-13 15:35:32 +02:00
|
|
|
; CHECK-NEXT: cmp r6, #3
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: bhi .LBB15_6
|
|
|
|
; CHECK-NEXT: @ %bb.1: @ %if.then
|
2020-05-13 15:35:32 +02:00
|
|
|
; CHECK-NEXT: ldr r7, [r0, #8]
|
[ARM] Improve WLS lowering
Recently we improved the lowering of low overhead loops and tail
predicated loops, but concentrated first on the DLS do style loops. This
extends those improvements over to the WLS while loops, improving the
chance of lowering them successfully. To do this the lowering has to
change a little as the instructions are terminators that produce a value
- something that needs to be treated carefully.
Lowering starts at the Hardware Loop pass, inserting a new
llvm.test.start.loop.iterations that produces both an i1 to control the
loop entry and an i32 similar to the llvm.start.loop.iterations
intrinsic added for do loops. This feeds into the loop phi, properly
gluing the values together:
%wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div)
%wls0 = extractvalue { i32, i1 } %wls, 0
%wls1 = extractvalue { i32, i1 } %wls, 1
br i1 %wls1, label %loop.ph, label %loop.exit
...
loop:
%lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ]
..
%iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1)
%cmp = icmp ne i32 %iv.next, 0
br i1 %cmp, label %loop, label %loop.exit
The llvm.test.start.loop.iterations need to be lowered through ISel
lowering as a pair of WLS and WLSSETUP nodes, which each get converted
to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent
t2WhileLoopStart from being a terminator that produces a value,
something difficult to control at that stage in the pipeline. Instead
the t2WhileLoopSetup produces the value of LR (essentially acting as a
lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc).
These are then converted into a single t2WhileLoopStartLR at the same
point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop
to prevent them from progressing further in the pipeline. The
t2WhileLoopStartLR is a single instruction that takes a GPR and produces
LR, similar to the WLS instruction.
%1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3
t2B %bb.1
...
bb.2.loop:
%2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2
...
%3:gprlr = t2LoopEndDec %2:gprlr, %bb.2
t2B %bb.3
The t2WhileLoopStartLR can then be treated similar to the other low
overhead loop pseudos, eventually being lowered to a WLS providing the
branches are within range.
Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: add.w r4, r10, r6, lsl #1
|
|
|
|
; CHECK-NEXT: lsrs r5, r3, #2
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: ldrh.w r8, [r7, #6]
|
2020-07-16 11:36:23 +02:00
|
|
|
; CHECK-NEXT: ldrh.w r12, [r7, #4]
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: ldrh r6, [r7, #2]
|
2020-05-13 15:35:32 +02:00
|
|
|
; CHECK-NEXT: ldrh r7, [r7]
|
[ARM] Improve WLS lowering
Recently we improved the lowering of low overhead loops and tail
predicated loops, but concentrated first on the DLS do style loops. This
extends those improvements over to the WLS while loops, improving the
chance of lowering them successfully. To do this the lowering has to
change a little as the instructions are terminators that produce a value
- something that needs to be treated carefully.
Lowering starts at the Hardware Loop pass, inserting a new
llvm.test.start.loop.iterations that produces both an i1 to control the
loop entry and an i32 similar to the llvm.start.loop.iterations
intrinsic added for do loops. This feeds into the loop phi, properly
gluing the values together:
%wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div)
%wls0 = extractvalue { i32, i1 } %wls, 0
%wls1 = extractvalue { i32, i1 } %wls, 1
br i1 %wls1, label %loop.ph, label %loop.exit
...
loop:
%lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ]
..
%iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1)
%cmp = icmp ne i32 %iv.next, 0
br i1 %cmp, label %loop, label %loop.exit
The llvm.test.start.loop.iterations need to be lowered through ISel
lowering as a pair of WLS and WLSSETUP nodes, which each get converted
to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent
t2WhileLoopStart from being a terminator that produces a value,
something difficult to control at that stage in the pipeline. Instead
the t2WhileLoopSetup produces the value of LR (essentially acting as a
lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc).
These are then converted into a single t2WhileLoopStartLR at the same
point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop
to prevent them from progressing further in the pipeline. The
t2WhileLoopStartLR is a single instruction that takes a GPR and produces
LR, similar to the WLS instruction.
%1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3
t2B %bb.1
...
bb.2.loop:
%2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2
...
%3:gprlr = t2LoopEndDec %2:gprlr, %bb.2
t2B %bb.3
The t2WhileLoopStartLR can then be treated similar to the other low
overhead loop pseudos, eventually being lowered to a WLS providing the
branches are within range.
Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: wls lr, r5, .LBB15_5
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
|
[ARM] Improve WLS lowering
Recently we improved the lowering of low overhead loops and tail
predicated loops, but concentrated first on the DLS do style loops. This
extends those improvements over to the WLS while loops, improving the
chance of lowering them successfully. To do this the lowering has to
change a little as the instructions are terminators that produce a value
- something that needs to be treated carefully.
Lowering starts at the Hardware Loop pass, inserting a new
llvm.test.start.loop.iterations that produces both an i1 to control the
loop entry and an i32 similar to the llvm.start.loop.iterations
intrinsic added for do loops. This feeds into the loop phi, properly
gluing the values together:
%wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div)
%wls0 = extractvalue { i32, i1 } %wls, 0
%wls1 = extractvalue { i32, i1 } %wls, 1
br i1 %wls1, label %loop.ph, label %loop.exit
...
loop:
%lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ]
..
%iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1)
%cmp = icmp ne i32 %iv.next, 0
br i1 %cmp, label %loop, label %loop.exit
The llvm.test.start.loop.iterations need to be lowered through ISel
lowering as a pair of WLS and WLSSETUP nodes, which each get converted
to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent
t2WhileLoopStart from being a terminator that produces a value,
something difficult to control at that stage in the pipeline. Instead
the t2WhileLoopSetup produces the value of LR (essentially acting as a
lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc).
These are then converted into a single t2WhileLoopStartLR at the same
point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop
to prevent them from progressing further in the pipeline. The
t2WhileLoopStartLR is a single instruction that takes a GPR and produces
LR, similar to the WLS instruction.
%1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3
t2B %bb.1
...
bb.2.loop:
%2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2
...
%3:gprlr = t2LoopEndDec %2:gprlr, %bb.2
t2B %bb.3
The t2WhileLoopStartLR can then be treated similar to the other low
overhead loop pseudos, eventually being lowered to a WLS providing the
branches are within range.
Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: str.w r9, [sp, #12] @ 4-byte Spill
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: bic r5, r3, #3
|
[ARM] Improve WLS lowering
Recently we improved the lowering of low overhead loops and tail
predicated loops, but concentrated first on the DLS do style loops. This
extends those improvements over to the WLS while loops, improving the
chance of lowering them successfully. To do this the lowering has to
change a little as the instructions are terminators that produce a value
- something that needs to be treated carefully.
Lowering starts at the Hardware Loop pass, inserting a new
llvm.test.start.loop.iterations that produces both an i1 to control the
loop entry and an i32 similar to the llvm.start.loop.iterations
intrinsic added for do loops. This feeds into the loop phi, properly
gluing the values together:
%wls = call { i32, i1 } @llvm.test.start.loop.iterations.i32(i32 %div)
%wls0 = extractvalue { i32, i1 } %wls, 0
%wls1 = extractvalue { i32, i1 } %wls, 1
br i1 %wls1, label %loop.ph, label %loop.exit
...
loop:
%lsr.iv = phi i32 [ %wls0, %loop.ph ], [ %iv.next, %loop ]
..
%iv.next = call i32 @llvm.loop.decrement.reg.i32(i32 %lsr.iv, i32 1)
%cmp = icmp ne i32 %iv.next, 0
br i1 %cmp, label %loop, label %loop.exit
The llvm.test.start.loop.iterations need to be lowered through ISel
lowering as a pair of WLS and WLSSETUP nodes, which each get converted
to t2WhileLoopSetup and t2WhileLoopStart Pseudos. This helps prevent
t2WhileLoopStart from being a terminator that produces a value,
something difficult to control at that stage in the pipeline. Instead
the t2WhileLoopSetup produces the value of LR (essentially acting as a
lr = subs rn, 0), t2WhileLoopStart consumes that lr value (the Bcc).
These are then converted into a single t2WhileLoopStartLR at the same
point as t2DoLoopStartTP and t2LoopEndDec. Otherwise we revert the loop
to prevent them from progressing further in the pipeline. The
t2WhileLoopStartLR is a single instruction that takes a GPR and produces
LR, similar to the WLS instruction.
%1:gprlr = t2WhileLoopStartLR %0:rgpr, %bb.3
t2B %bb.1
...
bb.2.loop:
%2:gprlr = PHI %1:gprlr, %bb.1, %3:gprlr, %bb.2
...
%3:gprlr = t2LoopEndDec %2:gprlr, %bb.2
t2B %bb.3
The t2WhileLoopStartLR can then be treated similar to the other low
overhead loop pseudos, eventually being lowered to a WLS providing the
branches are within range.
Differential Revision: https://reviews.llvm.org/D97729
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: add.w r9, r10, #2
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: str r5, [sp] @ 4-byte Spill
|
2020-05-13 15:35:32 +02:00
|
|
|
; CHECK-NEXT: add.w r5, r2, r5, lsl #1
|
|
|
|
; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: .LBB15_3: @ %while.body
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: vldrw.u32 q0, [r1], #8
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: sub.w r11, r9, #2
|
|
|
|
; CHECK-NEXT: add.w r5, r9, #2
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: vstrb.8 q0, [r4], #8
|
2020-05-13 15:35:32 +02:00
|
|
|
; CHECK-NEXT: vldrw.u32 q0, [r11]
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r9]
|
2020-05-13 15:35:32 +02:00
|
|
|
; CHECK-NEXT: vmul.f16 q0, q0, r7
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r6
|
2020-05-13 15:35:32 +02:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5]
|
2020-07-16 11:36:23 +02:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r12
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r9, #4]
|
|
|
|
; CHECK-NEXT: add.w r9, r9, #8
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r8
|
|
|
|
; CHECK-NEXT: vstrb.8 q0, [r2], #8
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: le lr, .LBB15_3
|
|
|
|
; CHECK-NEXT: @ %bb.4: @ %while.end.loopexit
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: ldr r2, [sp] @ 4-byte Reload
|
|
|
|
; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: ldr.w r9, [sp, #12] @ 4-byte Reload
|
|
|
|
; CHECK-NEXT: add.w r10, r10, r2, lsl #1
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: add.w r1, r1, r2, lsl #1
|
|
|
|
; CHECK-NEXT: ldr r2, [sp, #4] @ 4-byte Reload
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: .LBB15_5: @ %while.end
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: and r5, r3, #3
|
2020-05-13 15:35:32 +02:00
|
|
|
; CHECK-NEXT: vldrw.u32 q0, [r1]
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vctp.16 r5
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: vpst
|
2020-05-13 15:35:32 +02:00
|
|
|
; CHECK-NEXT: vstrht.16 q0, [r4]
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q0, [r10]
|
|
|
|
; CHECK-NEXT: add.w r1, r10, #2
|
2020-05-13 15:35:32 +02:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r1]
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: add.w r1, r10, #6
|
2020-05-13 15:35:32 +02:00
|
|
|
; CHECK-NEXT: vmul.f16 q0, q0, r7
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r6
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r10, #4]
|
2020-07-16 11:36:23 +02:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r12
|
2020-05-13 15:35:32 +02:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r1]
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r8
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: vpst
|
2020-05-13 15:35:32 +02:00
|
|
|
; CHECK-NEXT: vstrht.16 q0, [r2]
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: ldr.w r10, [r0, #4]
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: .LBB15_6: @ %if.end
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: add.w r0, r10, r3, lsl #1
|
|
|
|
; CHECK-NEXT: lsr.w r1, r9, #2
|
|
|
|
; CHECK-NEXT: wls lr, r1, .LBB15_10
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: @ %bb.7: @ %while.body51.preheader
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: bic r2, r9, #3
|
2020-05-29 11:53:30 +02:00
|
|
|
; CHECK-NEXT: adds r1, r2, r3
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: mov r3, r10
|
|
|
|
; CHECK-NEXT: add.w r1, r10, r1, lsl #1
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: .LBB15_8: @ %while.body51
|
|
|
|
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
|
|
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
|
|
|
|
; CHECK-NEXT: vstrb.8 q0, [r3], #8
|
|
|
|
; CHECK-NEXT: le lr, .LBB15_8
|
|
|
|
; CHECK-NEXT: @ %bb.9: @ %while.end55.loopexit
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: add.w r10, r10, r2, lsl #1
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: mov r0, r1
|
|
|
|
; CHECK-NEXT: .LBB15_10: @ %while.end55
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: ands r1, r9, #3
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: beq .LBB15_12
|
|
|
|
; CHECK-NEXT: @ %bb.11: @ %if.then59
|
|
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0]
|
|
|
|
; CHECK-NEXT: vctp.16 r1
|
|
|
|
; CHECK-NEXT: vpst
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vstrht.16 q0, [r10]
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: .LBB15_12: @ %if.end61
|
2020-03-25 12:35:53 +01:00
|
|
|
; CHECK-NEXT: add sp, #16
|
|
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
2020-03-20 09:25:19 +01:00
|
|
|
entry:
|
|
|
|
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1
|
|
|
|
%0 = load half*, half** %pState1, align 4
|
|
|
|
%pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2
|
|
|
|
%1 = load half*, half** %pCoeffs2, align 4
|
|
|
|
%numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0
|
|
|
|
%2 = load i16, i16* %numTaps3, align 4
|
|
|
|
%conv = zext i16 %2 to i32
|
|
|
|
%sub = add nsw i32 %conv, -1
|
|
|
|
%cmp = icmp ult i32 %sub, 4
|
|
|
|
br i1 %cmp, label %if.then, label %if.end
|
|
|
|
|
|
|
|
if.then: ; preds = %entry
|
|
|
|
%arrayidx = getelementptr inbounds half, half* %0, i32 %sub
|
|
|
|
%incdec.ptr = getelementptr inbounds half, half* %1, i32 1
|
|
|
|
%3 = load half, half* %1, align 4
|
|
|
|
%incdec.ptr6 = getelementptr inbounds half, half* %1, i32 2
|
|
|
|
%4 = load half, half* %incdec.ptr, align 4
|
|
|
|
%incdec.ptr7 = getelementptr inbounds half, half* %1, i32 3
|
|
|
|
%5 = load half, half* %incdec.ptr6, align 4
|
|
|
|
%6 = load half, half* %incdec.ptr7, align 4
|
|
|
|
%shr = lshr i32 %blockSize, 2
|
|
|
|
%cmp9146 = icmp eq i32 %shr, 0
|
|
|
|
%.pre161 = insertelement <8 x half> undef, half %3, i32 0
|
|
|
|
%.pre162 = shufflevector <8 x half> %.pre161, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%.pre163 = insertelement <8 x half> undef, half %4, i32 0
|
|
|
|
%.pre164 = shufflevector <8 x half> %.pre163, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%.pre165 = insertelement <8 x half> undef, half %5, i32 0
|
|
|
|
%.pre166 = shufflevector <8 x half> %.pre165, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%.pre167 = insertelement <8 x half> undef, half %6, i32 0
|
|
|
|
%.pre168 = shufflevector <8 x half> %.pre167, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
br i1 %cmp9146, label %while.end, label %while.body.lr.ph
|
|
|
|
|
|
|
|
while.body.lr.ph: ; preds = %if.then
|
|
|
|
%7 = and i32 %blockSize, -4
|
|
|
|
%scevgep158 = getelementptr half, half* %pDst, i32 %7
|
|
|
|
br label %while.body
|
|
|
|
|
|
|
|
while.body: ; preds = %while.body.lr.ph, %while.body
|
|
|
|
%pStateCur.0151 = phi half* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.body ]
|
|
|
|
%pSamples.0150 = phi half* [ %0, %while.body.lr.ph ], [ %add.ptr24, %while.body ]
|
|
|
|
%pOutput.0149 = phi half* [ %pDst, %while.body.lr.ph ], [ %add.ptr23, %while.body ]
|
|
|
|
%pTempSrc.0148 = phi half* [ %pSrc, %while.body.lr.ph ], [ %add.ptr11, %while.body ]
|
|
|
|
%blkCnt.0147 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec, %while.body ]
|
|
|
|
%8 = bitcast half* %pTempSrc.0148 to <8 x half>*
|
|
|
|
%9 = load <8 x half>, <8 x half>* %8, align 4
|
|
|
|
%10 = bitcast half* %pStateCur.0151 to <8 x half>*
|
|
|
|
store <8 x half> %9, <8 x half>* %10, align 4
|
|
|
|
%add.ptr = getelementptr inbounds half, half* %pStateCur.0151, i32 4
|
|
|
|
%add.ptr11 = getelementptr inbounds half, half* %pTempSrc.0148, i32 4
|
|
|
|
%11 = bitcast half* %pSamples.0150 to <8 x half>*
|
|
|
|
%12 = load <8 x half>, <8 x half>* %11, align 4
|
|
|
|
%13 = fmul fast <8 x half> %12, %.pre162
|
|
|
|
%arrayidx12 = getelementptr inbounds half, half* %pSamples.0150, i32 1
|
|
|
|
%14 = bitcast half* %arrayidx12 to <8 x half>*
|
|
|
|
%15 = load <8 x half>, <8 x half>* %14, align 4
|
|
|
|
%mul = fmul fast <8 x half> %15, %.pre164
|
|
|
|
%add = fadd fast <8 x half> %mul, %13
|
|
|
|
%arrayidx13 = getelementptr inbounds half, half* %pSamples.0150, i32 2
|
|
|
|
%16 = bitcast half* %arrayidx13 to <8 x half>*
|
|
|
|
%17 = load <8 x half>, <8 x half>* %16, align 4
|
|
|
|
%mul16 = fmul fast <8 x half> %17, %.pre166
|
|
|
|
%add17 = fadd fast <8 x half> %add, %mul16
|
|
|
|
%arrayidx18 = getelementptr inbounds half, half* %pSamples.0150, i32 3
|
|
|
|
%18 = bitcast half* %arrayidx18 to <8 x half>*
|
|
|
|
%19 = load <8 x half>, <8 x half>* %18, align 4
|
|
|
|
%mul21 = fmul fast <8 x half> %19, %.pre168
|
|
|
|
%add22 = fadd fast <8 x half> %add17, %mul21
|
|
|
|
%20 = bitcast half* %pOutput.0149 to <8 x half>*
|
|
|
|
store <8 x half> %add22, <8 x half>* %20, align 4
|
|
|
|
%add.ptr23 = getelementptr inbounds half, half* %pOutput.0149, i32 4
|
|
|
|
%add.ptr24 = getelementptr inbounds half, half* %pSamples.0150, i32 4
|
|
|
|
%dec = add nsw i32 %blkCnt.0147, -1
|
|
|
|
%cmp9 = icmp eq i32 %dec, 0
|
|
|
|
br i1 %cmp9, label %while.end.loopexit, label %while.body
|
|
|
|
|
|
|
|
while.end.loopexit: ; preds = %while.body
|
|
|
|
%scevgep157 = getelementptr half, half* %pSrc, i32 %7
|
|
|
|
%scevgep159 = getelementptr half, half* %0, i32 %7
|
|
|
|
br label %while.end
|
|
|
|
|
|
|
|
while.end: ; preds = %if.then, %while.end.loopexit
|
|
|
|
%pTempSrc.0.lcssa = phi half* [ %scevgep157, %while.end.loopexit ], [ %pSrc, %if.then ]
|
|
|
|
%pOutput.0.lcssa = phi half* [ %scevgep158, %while.end.loopexit ], [ %pDst, %if.then ]
|
|
|
|
%pSamples.0.lcssa = phi half* [ %scevgep159, %while.end.loopexit ], [ %0, %if.then ]
|
|
|
|
%pStateCur.0.lcssa = phi half* [ %add.ptr, %while.end.loopexit ], [ %arrayidx, %if.then ]
|
|
|
|
%and = and i32 %blockSize, 3
|
|
|
|
%21 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and)
|
|
|
|
%22 = bitcast half* %pTempSrc.0.lcssa to <8 x half>*
|
|
|
|
%23 = load <8 x half>, <8 x half>* %22, align 4
|
|
|
|
%24 = bitcast half* %pStateCur.0.lcssa to <8 x half>*
|
|
|
|
tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %23, <8 x half>* %24, i32 4, <8 x i1> %21)
|
|
|
|
%25 = bitcast half* %pSamples.0.lcssa to <8 x half>*
|
|
|
|
%26 = load <8 x half>, <8 x half>* %25, align 4
|
|
|
|
%27 = fmul fast <8 x half> %26, %.pre162
|
|
|
|
%arrayidx29 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 1
|
|
|
|
%28 = bitcast half* %arrayidx29 to <8 x half>*
|
|
|
|
%29 = load <8 x half>, <8 x half>* %28, align 4
|
|
|
|
%mul32 = fmul fast <8 x half> %29, %.pre164
|
|
|
|
%add33 = fadd fast <8 x half> %mul32, %27
|
|
|
|
%arrayidx34 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 2
|
|
|
|
%30 = bitcast half* %arrayidx34 to <8 x half>*
|
|
|
|
%31 = load <8 x half>, <8 x half>* %30, align 4
|
|
|
|
%mul37 = fmul fast <8 x half> %31, %.pre166
|
|
|
|
%add38 = fadd fast <8 x half> %add33, %mul37
|
|
|
|
%arrayidx39 = getelementptr inbounds half, half* %pSamples.0.lcssa, i32 3
|
|
|
|
%32 = bitcast half* %arrayidx39 to <8 x half>*
|
|
|
|
%33 = load <8 x half>, <8 x half>* %32, align 4
|
|
|
|
%mul42 = fmul fast <8 x half> %33, %.pre168
|
|
|
|
%add43 = fadd fast <8 x half> %add38, %mul42
|
|
|
|
%34 = bitcast half* %pOutput.0.lcssa to <8 x half>*
|
|
|
|
tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %add43, <8 x half>* %34, i32 4, <8 x i1> %21)
|
|
|
|
%.pre = load half*, half** %pState1, align 4
|
|
|
|
br label %if.end
|
|
|
|
|
|
|
|
if.end: ; preds = %while.end, %entry
|
|
|
|
%35 = phi half* [ %.pre, %while.end ], [ %0, %entry ]
|
|
|
|
%arrayidx45 = getelementptr inbounds half, half* %35, i32 %blockSize
|
|
|
|
%shr47 = lshr i32 %conv, 2
|
|
|
|
%cmp49141 = icmp eq i32 %shr47, 0
|
|
|
|
br i1 %cmp49141, label %while.end55, label %while.body51.preheader
|
|
|
|
|
|
|
|
while.body51.preheader: ; preds = %if.end
|
|
|
|
%36 = and i32 %conv, 65532
|
|
|
|
%37 = add i32 %36, %blockSize
|
|
|
|
%scevgep = getelementptr half, half* %35, i32 %37
|
|
|
|
br label %while.body51
|
|
|
|
|
|
|
|
while.body51: ; preds = %while.body51.preheader, %while.body51
|
|
|
|
%pTempSrc.1144 = phi half* [ %add.ptr52, %while.body51 ], [ %arrayidx45, %while.body51.preheader ]
|
|
|
|
%pTempDest.0143 = phi half* [ %add.ptr53, %while.body51 ], [ %35, %while.body51.preheader ]
|
|
|
|
%blkCnt.1142 = phi i32 [ %dec54, %while.body51 ], [ %shr47, %while.body51.preheader ]
|
|
|
|
%38 = bitcast half* %pTempSrc.1144 to <8 x half>*
|
|
|
|
%39 = load <8 x half>, <8 x half>* %38, align 4
|
|
|
|
%40 = bitcast half* %pTempDest.0143 to <8 x half>*
|
|
|
|
store <8 x half> %39, <8 x half>* %40, align 4
|
|
|
|
%add.ptr52 = getelementptr inbounds half, half* %pTempSrc.1144, i32 4
|
|
|
|
%add.ptr53 = getelementptr inbounds half, half* %pTempDest.0143, i32 4
|
|
|
|
%dec54 = add nsw i32 %blkCnt.1142, -1
|
|
|
|
%cmp49 = icmp eq i32 %dec54, 0
|
|
|
|
br i1 %cmp49, label %while.end55.loopexit, label %while.body51
|
|
|
|
|
|
|
|
while.end55.loopexit: ; preds = %while.body51
|
|
|
|
%scevgep156 = getelementptr half, half* %35, i32 %36
|
|
|
|
br label %while.end55
|
|
|
|
|
|
|
|
while.end55: ; preds = %while.end55.loopexit, %if.end
|
|
|
|
%pTempDest.0.lcssa = phi half* [ %35, %if.end ], [ %scevgep156, %while.end55.loopexit ]
|
|
|
|
%pTempSrc.1.lcssa = phi half* [ %arrayidx45, %if.end ], [ %scevgep, %while.end55.loopexit ]
|
|
|
|
%and56 = and i32 %conv, 3
|
|
|
|
%cmp57 = icmp eq i32 %and56, 0
|
|
|
|
br i1 %cmp57, label %if.end61, label %if.then59
|
|
|
|
|
|
|
|
if.then59: ; preds = %while.end55
|
|
|
|
%41 = tail call <8 x i1> @llvm.arm.mve.vctp16(i32 %and56)
|
|
|
|
%42 = bitcast half* %pTempSrc.1.lcssa to <8 x half>*
|
|
|
|
%43 = load <8 x half>, <8 x half>* %42, align 4
|
|
|
|
%44 = bitcast half* %pTempDest.0.lcssa to <8 x half>*
|
|
|
|
tail call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %43, <8 x half>* %44, i32 4, <8 x i1> %41)
|
|
|
|
br label %if.end61
|
|
|
|
|
|
|
|
if.end61: ; preds = %while.end55, %if.then59
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
define void @fir(%struct.arm_fir_instance_f32* nocapture readonly %S, half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %blockSize) {
|
|
|
|
; CHECK-LABEL: fir:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
|
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: .pad #24
|
|
|
|
; CHECK-NEXT: sub sp, #24
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: cmp r3, #8
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: blo.w .LBB16_12
|
2021-01-22 20:18:34 +01:00
|
|
|
; CHECK-NEXT: @ %bb.1: @ %entry
|
|
|
|
; CHECK-NEXT: lsrs.w r12, r3, #2
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: beq.w .LBB16_12
|
|
|
|
; CHECK-NEXT: @ %bb.2: @ %while.body.lr.ph
|
2020-03-24 11:14:52 +01:00
|
|
|
; CHECK-NEXT: ldrh r4, [r0]
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: movs r1, #1
|
2021-01-22 20:18:34 +01:00
|
|
|
; CHECK-NEXT: ldrd r5, r3, [r0, #4]
|
2020-03-24 11:14:52 +01:00
|
|
|
; CHECK-NEXT: sub.w r0, r4, #8
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: add.w r7, r0, r0, lsr #29
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: and r0, r0, #7
|
|
|
|
; CHECK-NEXT: asrs r6, r7, #3
|
|
|
|
; CHECK-NEXT: cmp r6, #1
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: it gt
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: asrgt r1, r7, #3
|
2020-03-24 11:14:52 +01:00
|
|
|
; CHECK-NEXT: add.w r7, r5, r4, lsl #1
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
|
|
|
|
; CHECK-NEXT: subs r1, r7, #2
|
2021-01-22 20:18:34 +01:00
|
|
|
; CHECK-NEXT: rsbs r7, r4, #0
|
|
|
|
; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
|
|
|
|
; CHECK-NEXT: add.w r7, r3, #16
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
|
2021-01-22 20:18:34 +01:00
|
|
|
; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
|
2021-05-05 21:20:46 +02:00
|
|
|
; CHECK-NEXT: b .LBB16_5
|
|
|
|
; CHECK-NEXT: .LBB16_3: @ %for.end
|
|
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
|
|
|
|
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
|
|
|
|
; CHECK-NEXT: wls lr, r0, .LBB16_4
|
|
|
|
; CHECK-NEXT: b .LBB16_9
|
|
|
|
; CHECK-NEXT: .LBB16_4: @ %while.end
|
|
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
|
|
|
|
; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
|
|
|
|
; CHECK-NEXT: subs.w r12, r12, #1
|
|
|
|
; CHECK-NEXT: vstrb.8 q0, [r2], #8
|
|
|
|
; CHECK-NEXT: add.w r0, r5, r0, lsl #1
|
|
|
|
; CHECK-NEXT: add.w r5, r0, #8
|
|
|
|
; CHECK-NEXT: beq.w .LBB16_12
|
|
|
|
; CHECK-NEXT: .LBB16_5: @ %while.body
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
2021-05-05 21:20:46 +02:00
|
|
|
; CHECK-NEXT: @ Child Loop BB16_7 Depth 2
|
2021-01-18 18:16:07 +01:00
|
|
|
; CHECK-NEXT: @ Child Loop BB16_10 Depth 2
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
|
2021-01-22 20:18:34 +01:00
|
|
|
; CHECK-NEXT: ldrh.w lr, [r3, #14]
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
|
|
|
|
; CHECK-NEXT: ldrh.w r8, [r3, #12]
|
|
|
|
; CHECK-NEXT: ldrh r7, [r3, #10]
|
|
|
|
; CHECK-NEXT: ldrh r4, [r3, #8]
|
2021-01-22 20:18:34 +01:00
|
|
|
; CHECK-NEXT: ldrh r6, [r3, #6]
|
|
|
|
; CHECK-NEXT: ldrh.w r9, [r3, #4]
|
|
|
|
; CHECK-NEXT: ldrh.w r11, [r3, #2]
|
|
|
|
; CHECK-NEXT: ldrh.w r10, [r3]
|
2020-03-24 11:14:52 +01:00
|
|
|
; CHECK-NEXT: vstrb.8 q0, [r1], #8
|
2020-03-20 10:23:57 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q0, [r5]
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
|
|
|
|
; CHECK-NEXT: adds r0, r5, #2
|
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
[ARM] Alter t2DoLoopStart to define lr
This changes the definition of t2DoLoopStart from
t2DoLoopStart rGPR
to
GPRlr = t2DoLoopStart rGPR
This will hopefully mean that low overhead loops are more tied together,
and we can more reliably generate loops without reverting or being at
the whims of the register allocator.
This is a fairly simple change in itself, but leads to a number of other
required alterations.
- The hardware loop pass, if UsePhi is set, now generates loops of the
form:
  %start = llvm.start.loop.iterations(%N)
loop:
  %p = phi [%start], [%dec]
  %dec = llvm.loop.decrement.reg(%p, 1)
  %c = icmp ne %dec, 0
  br %c, loop, exit
- For this a new llvm.start.loop.iterations intrinsic was added, identical
to llvm.set.loop.iterations but produces a value as seen above, gluing
the loop together more through def-use chains.
- This new intrinsic conceptually produces the same output as its input,
which is taught to SCEV so that the checks in MVETailPredication are not
affected.
- Some minor changes are needed to the ARMLowOverheadLoop pass, but it has
been left mostly as before. We should now more reliably be able to tell
that the t2DoLoopStart is correct without having to prove it, but
t2WhileLoopStart and tail-predicated loops will remain the same.
- And all the tests have been updated. There are a lot of them!
This patch on its own might cause more trouble than it helps, with more
tail-predicated loops being reverted, but some additional patches can
hopefully improve upon that to get to something that is better overall.
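As a rough MIR-level sketch of what the new definition enables
(illustrative, not taken from this test; the register numbers, block names
and the t2LoopDec/t2LoopEnd pseudos shown here are assumptions based on
the description above):
  %1:gprlr = t2DoLoopStart %0:rgpr
  t2B %bb.loop
bb.loop:
  %2:gprlr = PHI %1:gprlr, %bb.preheader, %3:gprlr, %bb.loop
  ...
  %3:gprlr = t2LoopDec %2:gprlr, 1
  t2LoopEnd %3:gprlr, %bb.loop
  t2B %bb.exit
With LR defined by t2DoLoopStart, the whole low overhead loop stays
connected through def-use chains instead of relying on the register
allocator to happen to pick LR.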
Differential Revision: https://reviews.llvm.org/D89881
2020-11-10 16:57:58 +01:00
|
|
|
; CHECK-NEXT: vmul.f16 q0, q0, r10
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: adds r0, r5, #6
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r11
|
2020-03-24 11:14:52 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
|
2021-01-22 20:18:34 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r9
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
|
|
|
; CHECK-NEXT: add.w r0, r5, #10
|
2021-01-22 20:18:34 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r6
|
2020-03-24 11:14:52 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r4
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: add.w r0, r5, #14
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r7
|
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
|
|
|
|
; CHECK-NEXT: adds r5, #16
|
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r8
|
2020-03-24 11:14:52 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
|
2020-03-24 11:14:52 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, lr
|
|
|
|
; CHECK-NEXT: cmp r0, #16
|
2021-05-05 21:20:46 +02:00
|
|
|
; CHECK-NEXT: blo .LBB16_8
|
|
|
|
; CHECK-NEXT: @ %bb.6: @ %for.body.preheader
|
|
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
|
2020-11-10 16:57:58 +01:00
|
|
|
; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
|
|
|
|
; CHECK-NEXT: dls lr, r0
|
2020-10-20 09:55:21 +02:00
|
|
|
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
|
2021-05-05 21:20:46 +02:00
|
|
|
; CHECK-NEXT: .LBB16_7: @ %for.body
|
|
|
|
; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
2020-08-01 15:01:18 +02:00
|
|
|
; CHECK-NEXT: ldrh r0, [r6], #16
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5]
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: adds r4, r5, #2
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r4]
|
2020-08-01 15:01:18 +02:00
|
|
|
; CHECK-NEXT: ldrh r0, [r6, #-14]
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: adds r4, r5, #6
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
2020-08-01 15:01:18 +02:00
|
|
|
; CHECK-NEXT: ldrh r0, [r6, #-12]
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r4]
|
2020-08-01 15:01:18 +02:00
|
|
|
; CHECK-NEXT: ldrh r0, [r6, #-10]
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: add.w r4, r5, #10
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
2020-08-01 15:01:18 +02:00
|
|
|
; CHECK-NEXT: ldrh r0, [r6, #-8]
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r4]
|
2020-08-01 15:01:18 +02:00
|
|
|
; CHECK-NEXT: ldrh r0, [r6, #-6]
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: ldrh r4, [r6, #-2]
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
2020-08-01 15:01:18 +02:00
|
|
|
; CHECK-NEXT: ldrh r0, [r6, #-4]
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r0
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: add.w r0, r5, #14
|
2020-03-24 11:14:52 +01:00
|
|
|
; CHECK-NEXT: vldrw.u32 q1, [r0]
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: adds r5, #16
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r4
|
2021-05-05 21:20:46 +02:00
|
|
|
; CHECK-NEXT: le lr, .LBB16_7
|
|
|
|
; CHECK-NEXT: b .LBB16_3
|
|
|
|
; CHECK-NEXT: .LBB16_8: @ in Loop: Header=BB16_5 Depth=1
|
2021-01-18 18:16:07 +01:00
|
|
|
; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
|
2021-04-12 15:46:23 +02:00
|
|
|
; CHECK-NEXT: b .LBB16_3
|
2021-01-18 18:16:07 +01:00
|
|
|
; CHECK-NEXT: .LBB16_9: @ %while.body76.preheader
|
2021-05-05 21:20:46 +02:00
|
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: mov r0, r5
|
2021-01-18 18:16:07 +01:00
|
|
|
; CHECK-NEXT: .LBB16_10: @ %while.body76
|
2021-05-05 21:20:46 +02:00
|
|
|
; CHECK-NEXT: @ Parent Loop BB16_5 Depth=1
|
2021-01-18 18:16:07 +01:00
|
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: ldrh r4, [r6], #2
|
2021-01-18 18:16:07 +01:00
|
|
|
; CHECK-NEXT: vldrh.u16 q1, [r0], #2
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q0, q1, r4
|
2021-04-12 15:46:23 +02:00
|
|
|
; CHECK-NEXT: le lr, .LBB16_10
|
|
|
|
; CHECK-NEXT: @ %bb.11: @ %while.end.loopexit
|
2021-05-05 21:20:46 +02:00
|
|
|
; CHECK-NEXT: @ in Loop: Header=BB16_5 Depth=1
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
|
|
|
|
; CHECK-NEXT: add.w r5, r5, r0, lsl #1
|
2021-05-05 21:20:46 +02:00
|
|
|
; CHECK-NEXT: b .LBB16_4
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: .LBB16_12: @ %if.end
|
2020-07-03 15:18:32 +02:00
|
|
|
; CHECK-NEXT: add sp, #24
|
2020-03-20 09:25:19 +01:00
|
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
|
|
|
|
entry:
|
|
|
|
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 1
|
|
|
|
%0 = load half*, half** %pState1, align 4
|
|
|
|
%pCoeffs2 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 2
|
|
|
|
%1 = load half*, half** %pCoeffs2, align 4
|
|
|
|
%numTaps3 = getelementptr inbounds %struct.arm_fir_instance_f32, %struct.arm_fir_instance_f32* %S, i32 0, i32 0
|
|
|
|
%2 = load i16, i16* %numTaps3, align 4
|
|
|
|
%conv = zext i16 %2 to i32
|
|
|
|
%cmp = icmp ugt i32 %blockSize, 7
|
|
|
|
br i1 %cmp, label %if.then, label %if.end
|
|
|
|
|
|
|
|
if.then: ; preds = %entry
|
|
|
|
%shr = lshr i32 %blockSize, 2
|
|
|
|
%cmp5217 = icmp eq i32 %shr, 0
|
|
|
|
br i1 %cmp5217, label %if.end, label %while.body.lr.ph
|
|
|
|
|
|
|
|
while.body.lr.ph: ; preds = %if.then
|
|
|
|
%sub = add nsw i32 %conv, -1
|
|
|
|
%arrayidx = getelementptr inbounds half, half* %0, i32 %sub
|
|
|
|
%incdec.ptr = getelementptr inbounds half, half* %1, i32 1
|
|
|
|
%incdec.ptr7 = getelementptr inbounds half, half* %1, i32 2
|
|
|
|
%incdec.ptr8 = getelementptr inbounds half, half* %1, i32 3
|
|
|
|
%incdec.ptr9 = getelementptr inbounds half, half* %1, i32 4
|
|
|
|
%incdec.ptr10 = getelementptr inbounds half, half* %1, i32 5
|
|
|
|
%incdec.ptr11 = getelementptr inbounds half, half* %1, i32 6
|
|
|
|
%incdec.ptr12 = getelementptr inbounds half, half* %1, i32 7
|
|
|
|
%sub37 = add nsw i32 %conv, -8
|
|
|
|
%div = sdiv i32 %sub37, 8
|
|
|
|
%pCoeffsCur.0199 = getelementptr inbounds half, half* %1, i32 8
|
|
|
|
%cmp38201 = icmp ugt i16 %2, 15
|
|
|
|
%and = and i32 %sub37, 7
|
|
|
|
%cmp74210 = icmp eq i32 %and, 0
|
|
|
|
%idx.neg = sub nsw i32 0, %conv
|
|
|
|
%3 = icmp sgt i32 %div, 1
|
|
|
|
%smax = select i1 %3, i32 %div, i32 1
|
|
|
|
br label %while.body
|
|
|
|
|
|
|
|
while.body: ; preds = %while.body.lr.ph, %while.end
|
|
|
|
%blkCnt.0222 = phi i32 [ %shr, %while.body.lr.ph ], [ %dec84, %while.end ]
|
|
|
|
%pStateCur.0221 = phi half* [ %arrayidx, %while.body.lr.ph ], [ %add.ptr, %while.end ]
|
|
|
|
%pSamples.0220 = phi half* [ %0, %while.body.lr.ph ], [ %add.ptr83, %while.end ]
|
|
|
|
%pTempSrc.0219 = phi half* [ %pSrc, %while.body.lr.ph ], [ %add.ptr14, %while.end ]
|
|
|
|
%pOutput.0218 = phi half* [ %pDst, %while.body.lr.ph ], [ %add.ptr81, %while.end ]
|
|
|
|
%4 = load half, half* %1, align 4
|
|
|
|
%5 = load half, half* %incdec.ptr, align 4
|
|
|
|
%6 = load half, half* %incdec.ptr7, align 4
|
|
|
|
%7 = load half, half* %incdec.ptr8, align 4
|
|
|
|
%8 = load half, half* %incdec.ptr9, align 4
|
|
|
|
%9 = load half, half* %incdec.ptr10, align 4
|
|
|
|
%10 = load half, half* %incdec.ptr11, align 4
|
|
|
|
%11 = load half, half* %incdec.ptr12, align 4
|
|
|
|
%12 = bitcast half* %pTempSrc.0219 to <8 x half>*
|
|
|
|
%13 = load <8 x half>, <8 x half>* %12, align 4
|
|
|
|
%14 = bitcast half* %pStateCur.0221 to <8 x half>*
|
|
|
|
store <8 x half> %13, <8 x half>* %14, align 4
|
|
|
|
%add.ptr = getelementptr inbounds half, half* %pStateCur.0221, i32 4
|
|
|
|
%add.ptr14 = getelementptr inbounds half, half* %pTempSrc.0219, i32 4
|
|
|
|
%15 = bitcast half* %pSamples.0220 to <8 x half>*
|
|
|
|
%16 = load <8 x half>, <8 x half>* %15, align 4
|
|
|
|
%.splatinsert = insertelement <8 x half> undef, half %4, i32 0
|
|
|
|
%.splat = shufflevector <8 x half> %.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%17 = fmul fast <8 x half> %16, %.splat
|
|
|
|
%arrayidx15 = getelementptr inbounds half, half* %pSamples.0220, i32 1
|
|
|
|
%18 = bitcast half* %arrayidx15 to <8 x half>*
|
|
|
|
%19 = load <8 x half>, <8 x half>* %18, align 4
|
|
|
|
%.splatinsert16 = insertelement <8 x half> undef, half %5, i32 0
|
|
|
|
%.splat17 = shufflevector <8 x half> %.splatinsert16, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%20 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %19, <8 x half> %.splat17, <8 x half> %17)
|
|
|
|
%arrayidx18 = getelementptr inbounds half, half* %pSamples.0220, i32 2
|
|
|
|
%21 = bitcast half* %arrayidx18 to <8 x half>*
|
|
|
|
%22 = load <8 x half>, <8 x half>* %21, align 4
|
|
|
|
%.splatinsert19 = insertelement <8 x half> undef, half %6, i32 0
|
|
|
|
%.splat20 = shufflevector <8 x half> %.splatinsert19, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%23 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %22, <8 x half> %.splat20, <8 x half> %20)
|
|
|
|
%arrayidx21 = getelementptr inbounds half, half* %pSamples.0220, i32 3
|
|
|
|
%24 = bitcast half* %arrayidx21 to <8 x half>*
|
|
|
|
%25 = load <8 x half>, <8 x half>* %24, align 4
|
|
|
|
%.splatinsert22 = insertelement <8 x half> undef, half %7, i32 0
|
|
|
|
%.splat23 = shufflevector <8 x half> %.splatinsert22, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%26 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %25, <8 x half> %.splat23, <8 x half> %23)
|
|
|
|
%arrayidx24 = getelementptr inbounds half, half* %pSamples.0220, i32 4
|
|
|
|
%27 = bitcast half* %arrayidx24 to <8 x half>*
|
|
|
|
%28 = load <8 x half>, <8 x half>* %27, align 4
|
|
|
|
%.splatinsert25 = insertelement <8 x half> undef, half %8, i32 0
|
|
|
|
%.splat26 = shufflevector <8 x half> %.splatinsert25, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%29 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %28, <8 x half> %.splat26, <8 x half> %26)
|
|
|
|
%arrayidx27 = getelementptr inbounds half, half* %pSamples.0220, i32 5
|
|
|
|
%30 = bitcast half* %arrayidx27 to <8 x half>*
|
|
|
|
%31 = load <8 x half>, <8 x half>* %30, align 4
|
|
|
|
%.splatinsert28 = insertelement <8 x half> undef, half %9, i32 0
|
|
|
|
%.splat29 = shufflevector <8 x half> %.splatinsert28, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%32 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %31, <8 x half> %.splat29, <8 x half> %29)
|
|
|
|
%arrayidx30 = getelementptr inbounds half, half* %pSamples.0220, i32 6
|
|
|
|
%33 = bitcast half* %arrayidx30 to <8 x half>*
|
|
|
|
%34 = load <8 x half>, <8 x half>* %33, align 4
|
|
|
|
%.splatinsert31 = insertelement <8 x half> undef, half %10, i32 0
|
|
|
|
%.splat32 = shufflevector <8 x half> %.splatinsert31, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%35 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %34, <8 x half> %.splat32, <8 x half> %32)
|
|
|
|
%arrayidx33 = getelementptr inbounds half, half* %pSamples.0220, i32 7
|
|
|
|
%36 = bitcast half* %arrayidx33 to <8 x half>*
|
|
|
|
%37 = load <8 x half>, <8 x half>* %36, align 4
|
|
|
|
%.splatinsert34 = insertelement <8 x half> undef, half %11, i32 0
|
|
|
|
%.splat35 = shufflevector <8 x half> %.splatinsert34, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%38 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %37, <8 x half> %.splat35, <8 x half> %35)
|
|
|
|
%pSamples.1200 = getelementptr inbounds half, half* %pSamples.0220, i32 8
|
|
|
|
br i1 %cmp38201, label %for.body, label %for.end
|
|
|
|
|
|
|
|
for.body: ; preds = %while.body, %for.body
|
|
|
|
%pSamples.1207 = phi half* [ %pSamples.1, %for.body ], [ %pSamples.1200, %while.body ]
|
|
|
|
%pCoeffsCur.0206 = phi half* [ %pCoeffsCur.0, %for.body ], [ %pCoeffsCur.0199, %while.body ]
|
|
|
|
%.pn205 = phi half* [ %pCoeffsCur.0206, %for.body ], [ %1, %while.body ]
|
|
|
|
%i.0204 = phi i32 [ %inc, %for.body ], [ 0, %while.body ]
|
|
|
|
%vecAcc0.0203 = phi <8 x half> [ %70, %for.body ], [ %38, %while.body ]
|
|
|
|
%pSamples.0.pn202 = phi half* [ %pSamples.1207, %for.body ], [ %pSamples.0220, %while.body ]
|
|
|
|
%incdec.ptr40 = getelementptr inbounds half, half* %.pn205, i32 9
|
|
|
|
%39 = load half, half* %pCoeffsCur.0206, align 4
|
|
|
|
%incdec.ptr41 = getelementptr inbounds half, half* %.pn205, i32 10
|
|
|
|
%40 = load half, half* %incdec.ptr40, align 4
|
|
|
|
%incdec.ptr42 = getelementptr inbounds half, half* %.pn205, i32 11
|
|
|
|
%41 = load half, half* %incdec.ptr41, align 4
|
|
|
|
%incdec.ptr43 = getelementptr inbounds half, half* %.pn205, i32 12
|
|
|
|
%42 = load half, half* %incdec.ptr42, align 4
|
|
|
|
%incdec.ptr44 = getelementptr inbounds half, half* %.pn205, i32 13
|
|
|
|
%43 = load half, half* %incdec.ptr43, align 4
|
|
|
|
%incdec.ptr45 = getelementptr inbounds half, half* %.pn205, i32 14
|
|
|
|
%44 = load half, half* %incdec.ptr44, align 4
|
|
|
|
%incdec.ptr46 = getelementptr inbounds half, half* %.pn205, i32 15
|
|
|
|
%45 = load half, half* %incdec.ptr45, align 4
|
|
|
|
%46 = load half, half* %incdec.ptr46, align 4
|
|
|
|
%47 = bitcast half* %pSamples.1207 to <8 x half>*
|
|
|
|
%48 = load <8 x half>, <8 x half>* %47, align 4
|
|
|
|
%.splatinsert48 = insertelement <8 x half> undef, half %39, i32 0
|
|
|
|
%.splat49 = shufflevector <8 x half> %.splatinsert48, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%49 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %48, <8 x half> %.splat49, <8 x half> %vecAcc0.0203)
|
|
|
|
%arrayidx50 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 9
|
|
|
|
%50 = bitcast half* %arrayidx50 to <8 x half>*
|
|
|
|
%51 = load <8 x half>, <8 x half>* %50, align 4
|
|
|
|
%.splatinsert51 = insertelement <8 x half> undef, half %40, i32 0
|
|
|
|
%.splat52 = shufflevector <8 x half> %.splatinsert51, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%52 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %51, <8 x half> %.splat52, <8 x half> %49)
|
|
|
|
%arrayidx53 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 10
|
|
|
|
%53 = bitcast half* %arrayidx53 to <8 x half>*
|
|
|
|
%54 = load <8 x half>, <8 x half>* %53, align 4
|
|
|
|
%.splatinsert54 = insertelement <8 x half> undef, half %41, i32 0
|
|
|
|
%.splat55 = shufflevector <8 x half> %.splatinsert54, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%55 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %54, <8 x half> %.splat55, <8 x half> %52)
|
|
|
|
%arrayidx56 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 11
|
|
|
|
%56 = bitcast half* %arrayidx56 to <8 x half>*
|
|
|
|
%57 = load <8 x half>, <8 x half>* %56, align 4
|
|
|
|
%.splatinsert57 = insertelement <8 x half> undef, half %42, i32 0
|
|
|
|
%.splat58 = shufflevector <8 x half> %.splatinsert57, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%58 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %57, <8 x half> %.splat58, <8 x half> %55)
|
|
|
|
%arrayidx59 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 12
|
|
|
|
%59 = bitcast half* %arrayidx59 to <8 x half>*
|
|
|
|
%60 = load <8 x half>, <8 x half>* %59, align 4
|
|
|
|
%.splatinsert60 = insertelement <8 x half> undef, half %43, i32 0
|
|
|
|
%.splat61 = shufflevector <8 x half> %.splatinsert60, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%61 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %60, <8 x half> %.splat61, <8 x half> %58)
|
|
|
|
%arrayidx62 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 13
|
|
|
|
%62 = bitcast half* %arrayidx62 to <8 x half>*
|
|
|
|
%63 = load <8 x half>, <8 x half>* %62, align 4
|
|
|
|
%.splatinsert63 = insertelement <8 x half> undef, half %44, i32 0
|
|
|
|
%.splat64 = shufflevector <8 x half> %.splatinsert63, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%64 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %63, <8 x half> %.splat64, <8 x half> %61)
|
|
|
|
%arrayidx65 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 14
|
|
|
|
%65 = bitcast half* %arrayidx65 to <8 x half>*
|
|
|
|
%66 = load <8 x half>, <8 x half>* %65, align 4
|
|
|
|
%.splatinsert66 = insertelement <8 x half> undef, half %45, i32 0
|
|
|
|
%.splat67 = shufflevector <8 x half> %.splatinsert66, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%67 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %66, <8 x half> %.splat67, <8 x half> %64)
|
|
|
|
%arrayidx68 = getelementptr inbounds half, half* %pSamples.0.pn202, i32 15
|
|
|
|
%68 = bitcast half* %arrayidx68 to <8 x half>*
|
|
|
|
%69 = load <8 x half>, <8 x half>* %68, align 4
|
|
|
|
%.splatinsert69 = insertelement <8 x half> undef, half %46, i32 0
|
|
|
|
%.splat70 = shufflevector <8 x half> %.splatinsert69, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%70 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %69, <8 x half> %.splat70, <8 x half> %67)
|
|
|
|
%inc = add nuw nsw i32 %i.0204, 1
|
|
|
|
%pCoeffsCur.0 = getelementptr inbounds half, half* %pCoeffsCur.0206, i32 8
|
|
|
|
%pSamples.1 = getelementptr inbounds half, half* %pSamples.1207, i32 8
|
|
|
|
%exitcond = icmp eq i32 %inc, %smax
|
|
|
|
br i1 %exitcond, label %for.end, label %for.body
|
|
|
|
|
|
|
|
for.end: ; preds = %for.body, %while.body
|
|
|
|
%vecAcc0.0.lcssa = phi <8 x half> [ %38, %while.body ], [ %70, %for.body ]
|
|
|
|
%pCoeffsCur.0.lcssa = phi half* [ %pCoeffsCur.0199, %while.body ], [ %pCoeffsCur.0, %for.body ]
|
|
|
|
%pSamples.1.lcssa = phi half* [ %pSamples.1200, %while.body ], [ %pSamples.1, %for.body ]
|
|
|
|
br i1 %cmp74210, label %while.end, label %while.body76
|
|
|
|
|
|
|
|
while.body76: ; preds = %for.end, %while.body76
|
|
|
|
%pCoeffsCur.1214 = phi half* [ %incdec.ptr77, %while.body76 ], [ %pCoeffsCur.0.lcssa, %for.end ]
|
|
|
|
%vecAcc0.1213 = phi <8 x half> [ %74, %while.body76 ], [ %vecAcc0.0.lcssa, %for.end ]
|
|
|
|
%numCnt.0212 = phi i32 [ %dec, %while.body76 ], [ %and, %for.end ]
|
|
|
|
%pSamples.2211 = phi half* [ %incdec.ptr80, %while.body76 ], [ %pSamples.1.lcssa, %for.end ]
|
|
|
|
%incdec.ptr77 = getelementptr inbounds half, half* %pCoeffsCur.1214, i32 1
|
|
|
|
%71 = load half, half* %pCoeffsCur.1214, align 4
|
|
|
|
%72 = bitcast half* %pSamples.2211 to <8 x half>*
|
|
|
|
%73 = load <8 x half>, <8 x half>* %72, align 4
|
|
|
|
%.splatinsert78 = insertelement <8 x half> undef, half %71, i32 0
|
|
|
|
%.splat79 = shufflevector <8 x half> %.splatinsert78, <8 x half> undef, <8 x i32> zeroinitializer
|
|
|
|
%74 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %73, <8 x half> %.splat79, <8 x half> %vecAcc0.1213)
|
|
|
|
%incdec.ptr80 = getelementptr inbounds half, half* %pSamples.2211, i32 1
|
|
|
|
%dec = add nsw i32 %numCnt.0212, -1
|
|
|
|
%cmp74 = icmp sgt i32 %numCnt.0212, 1
|
|
|
|
br i1 %cmp74, label %while.body76, label %while.end.loopexit
|
|
|
|
|
|
|
|
while.end.loopexit: ; preds = %while.body76
|
|
|
|
%scevgep = getelementptr half, half* %pSamples.1.lcssa, i32 %and
|
|
|
|
br label %while.end
|
|
|
|
|
|
|
|
while.end: ; preds = %while.end.loopexit, %for.end
|
|
|
|
%pSamples.2.lcssa = phi half* [ %pSamples.1.lcssa, %for.end ], [ %scevgep, %while.end.loopexit ]
|
|
|
|
%vecAcc0.1.lcssa = phi <8 x half> [ %vecAcc0.0.lcssa, %for.end ], [ %74, %while.end.loopexit ]
|
|
|
|
%75 = bitcast half* %pOutput.0218 to <8 x half>*
|
|
|
|
store <8 x half> %vecAcc0.1.lcssa, <8 x half>* %75, align 4
|
|
|
|
%add.ptr81 = getelementptr inbounds half, half* %pOutput.0218, i32 4
|
|
|
|
%add.ptr82 = getelementptr inbounds half, half* %pSamples.2.lcssa, i32 4
|
|
|
|
%add.ptr83 = getelementptr inbounds half, half* %add.ptr82, i32 %idx.neg
|
|
|
|
%dec84 = add nsw i32 %blkCnt.0222, -1
|
|
|
|
%cmp5 = icmp eq i32 %dec84, 0
|
|
|
|
br i1 %cmp5, label %if.end, label %while.body
|
|
|
|
|
|
|
|
if.end: ; preds = %while.end, %if.then, %entry
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2021-02-08 11:50:23 +01:00
|
|
|
%struct.arm_biquad_cascade_df2T_instance_f16 = type { i8, half*, half* }
|
|
|
|
define void @arm_biquad_cascade_df2T_f16(%struct.arm_biquad_cascade_df2T_instance_f16* nocapture readonly %S, half* nocapture readonly %pSrc, half* nocapture %pDst, i32 %blockSize) {
|
|
|
|
; CHECK-LABEL: arm_biquad_cascade_df2T_f16:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
2021-02-12 19:34:58 +01:00
|
|
|
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
|
|
|
|
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
|
|
|
|
; CHECK-NEXT: .pad #4
|
|
|
|
; CHECK-NEXT: sub sp, #4
|
|
|
|
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
|
|
|
|
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
|
2021-02-08 11:50:23 +01:00
|
|
|
; CHECK-NEXT: ldrd r12, r6, [r0, #4]
|
|
|
|
; CHECK-NEXT: and r8, r3, #1
|
|
|
|
; CHECK-NEXT: ldrb r0, [r0]
|
|
|
|
; CHECK-NEXT: vldr.16 s4, .LCPI17_0
|
2021-02-12 19:34:58 +01:00
|
|
|
; CHECK-NEXT: lsr.w r9, r3, #1
|
2021-02-08 11:50:23 +01:00
|
|
|
; CHECK-NEXT: vmov.i32 q0, #0x0
|
|
|
|
; CHECK-NEXT: b .LBB17_3
|
|
|
|
; CHECK-NEXT: .LBB17_1: @ %if.else
|
|
|
|
; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1
|
2021-02-12 19:34:58 +01:00
|
|
|
; CHECK-NEXT: vstr.16 s8, [r12]
|
|
|
|
; CHECK-NEXT: vmovx.f16 s9, s8
|
2021-02-08 11:50:23 +01:00
|
|
|
; CHECK-NEXT: .LBB17_2: @ %if.end
|
|
|
|
; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1
|
2021-02-12 19:34:58 +01:00
|
|
|
; CHECK-NEXT: vstr.16 s9, [r12, #2]
|
2021-02-08 11:50:23 +01:00
|
|
|
; CHECK-NEXT: adds r6, #10
|
|
|
|
; CHECK-NEXT: subs r0, #1
|
|
|
|
; CHECK-NEXT: add.w r12, r12, #4
|
|
|
|
; CHECK-NEXT: mov r1, r2
|
|
|
|
; CHECK-NEXT: beq .LBB17_8
|
|
|
|
; CHECK-NEXT: .LBB17_3: @ %do.body
|
|
|
|
; CHECK-NEXT: @ =>This Loop Header: Depth=1
|
|
|
|
; CHECK-NEXT: @ Child Loop BB17_5 Depth 2
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vldrh.u16 q3, [r6]
|
2021-02-08 11:50:23 +01:00
|
|
|
; CHECK-NEXT: movs r5, #0
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vmov q5, q3
|
2021-02-08 11:50:23 +01:00
|
|
|
; CHECK-NEXT: vshlc q5, r5, #16
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vldrh.u16 q4, [r6, #4]
|
|
|
|
; CHECK-NEXT: vmov q6, q4
|
2021-02-08 11:50:23 +01:00
|
|
|
; CHECK-NEXT: vshlc q6, r5, #16
|
2021-02-12 19:34:58 +01:00
|
|
|
; CHECK-NEXT: vldrh.u16 q2, [r12]
|
|
|
|
; CHECK-NEXT: vmov.f32 s9, s1
|
2021-02-08 11:50:23 +01:00
|
|
|
; CHECK-NEXT: mov r5, r2
|
2021-02-12 19:34:58 +01:00
|
|
|
; CHECK-NEXT: wls lr, r9, .LBB17_6
|
2021-02-08 11:50:23 +01:00
|
|
|
; CHECK-NEXT: @ %bb.4: @ %while.body.preheader
|
|
|
|
; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1
|
|
|
|
; CHECK-NEXT: mov r5, r2
|
|
|
|
; CHECK-NEXT: .LBB17_5: @ %while.body
|
|
|
|
; CHECK-NEXT: @ Parent Loop BB17_3 Depth=1
|
|
|
|
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
|
|
|
|
; CHECK-NEXT: ldrh r7, [r1], #4
|
2021-03-21 13:00:06 +01:00
|
|
|
; CHECK-NEXT: vmov r3, s4
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q2, q3, r7
|
2021-03-21 13:00:06 +01:00
|
|
|
; CHECK-NEXT: ldrh r4, [r1, #-2]
|
2021-02-12 19:34:58 +01:00
|
|
|
; CHECK-NEXT: vmov.u16 r7, q2[0]
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q2, q4, r7
|
2021-03-21 13:00:06 +01:00
|
|
|
; CHECK-NEXT: vins.f16 s9, s4
|
|
|
|
; CHECK-NEXT: vfma.f16 q2, q5, r4
|
|
|
|
; CHECK-NEXT: vmov.u16 r4, q2[1]
|
|
|
|
; CHECK-NEXT: vfma.f16 q2, q6, r4
|
|
|
|
; CHECK-NEXT: strh r4, [r5, #2]
|
2021-02-12 19:34:58 +01:00
|
|
|
; CHECK-NEXT: vmov.f32 s8, s9
|
|
|
|
; CHECK-NEXT: strh r7, [r5], #4
|
2021-03-21 13:00:06 +01:00
|
|
|
; CHECK-NEXT: vmov.16 q2[2], r3
|
2021-02-08 11:50:23 +01:00
|
|
|
; CHECK-NEXT: le lr, .LBB17_5
|
|
|
|
; CHECK-NEXT: .LBB17_6: @ %while.end
|
|
|
|
; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1
|
|
|
|
; CHECK-NEXT: cmp.w r8, #0
|
|
|
|
; CHECK-NEXT: beq .LBB17_1
|
|
|
|
; CHECK-NEXT: @ %bb.7: @ %if.then
|
|
|
|
; CHECK-NEXT: @ in Loop: Header=BB17_3 Depth=1
|
|
|
|
; CHECK-NEXT: ldrh r1, [r1]
|
2021-02-12 19:34:58 +01:00
|
|
|
; CHECK-NEXT: vfma.f16 q2, q3, r1
|
2021-03-11 15:06:04 +01:00
|
|
|
; CHECK-NEXT: vmov.u16 r1, q2[0]
|
|
|
|
; CHECK-NEXT: vfma.f16 q2, q4, r1
|
2021-02-12 19:34:58 +01:00
|
|
|
; CHECK-NEXT: strh r1, [r5]
|
|
|
|
; CHECK-NEXT: vmovx.f16 s6, s8
|
2021-02-08 11:50:23 +01:00
|
|
|
; CHECK-NEXT: vstr.16 s6, [r12]
|
|
|
|
; CHECK-NEXT: b .LBB17_2
|
|
|
|
; CHECK-NEXT: .LBB17_8: @ %do.end
|
2021-02-12 19:34:58 +01:00
|
|
|
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
|
|
|
|
; CHECK-NEXT: add sp, #4
|
|
|
|
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
|
2021-02-08 11:50:23 +01:00
|
|
|
; CHECK-NEXT: .p2align 1
|
|
|
|
; CHECK-NEXT: @ %bb.9:
|
|
|
|
; CHECK-NEXT: .LCPI17_0:
|
|
|
|
; CHECK-NEXT: .short 0x0000 @ half 0
|
|
|
|
entry:
|
|
|
|
%pState1 = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f16, %struct.arm_biquad_cascade_df2T_instance_f16* %S, i32 0, i32 1
|
|
|
|
%0 = load half*, half** %pState1, align 4
|
|
|
|
%numStages = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f16, %struct.arm_biquad_cascade_df2T_instance_f16* %S, i32 0, i32 0
|
|
|
|
%1 = load i8, i8* %numStages, align 4
|
|
|
|
%conv = zext i8 %1 to i32
|
|
|
|
%pCoeffs = getelementptr inbounds %struct.arm_biquad_cascade_df2T_instance_f16, %struct.arm_biquad_cascade_df2T_instance_f16* %S, i32 0, i32 2
|
|
|
|
%2 = load half*, half** %pCoeffs, align 4
|
|
|
|
%div = lshr i32 %blockSize, 1
|
|
|
|
%cmp.not90 = icmp eq i32 %div, 0
|
|
|
|
%and = and i32 %blockSize, 1
|
|
|
|
%tobool.not = icmp eq i32 %and, 0
|
|
|
|
br label %do.body
|
|
|
|
|
|
|
|
do.body: ; preds = %if.end, %entry
|
|
|
|
%stage.0 = phi i32 [ %conv, %entry ], [ %dec23, %if.end ]
|
|
|
|
%pCurCoeffs.0 = phi half* [ %2, %entry ], [ %add.ptr2, %if.end ]
|
|
|
|
%pState.0 = phi half* [ %0, %entry ], [ %pState.1, %if.end ]
|
|
|
|
%pIn.0 = phi half* [ %pSrc, %entry ], [ %pDst, %if.end ]
|
|
|
|
%3 = bitcast half* %pCurCoeffs.0 to <8 x half>*
|
|
|
|
%4 = load <8 x half>, <8 x half>* %3, align 2
|
|
|
|
%add.ptr = getelementptr inbounds half, half* %pCurCoeffs.0, i32 2
|
|
|
|
%5 = bitcast half* %add.ptr to <8 x half>*
|
|
|
|
%6 = load <8 x half>, <8 x half>* %5, align 2
|
|
|
|
%add.ptr2 = getelementptr inbounds half, half* %pCurCoeffs.0, i32 5
|
|
|
|
%7 = bitcast half* %pState.0 to <8 x half>*
|
|
|
|
%8 = load <8 x half>, <8 x half>* %7, align 2
|
|
|
|
%9 = shufflevector <8 x half> %8, <8 x half> <half poison, half poison, half 0xH0000, half 0xH0000, half poison, half poison, half poison, half poison>, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
|
|
|
|
%10 = bitcast <8 x half> %4 to <8 x i16>
|
|
|
|
%11 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %10, i32 0, i32 16)
|
|
|
|
%12 = extractvalue { i32, <8 x i16> } %11, 0
|
|
|
|
%13 = extractvalue { i32, <8 x i16> } %11, 1
|
|
|
|
%14 = bitcast <8 x i16> %13 to <8 x half>
|
|
|
|
%15 = bitcast <8 x half> %6 to <8 x i16>
|
|
|
|
%16 = tail call { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16> %15, i32 %12, i32 16)
|
|
|
|
%17 = extractvalue { i32, <8 x i16> } %16, 1
|
|
|
|
%18 = bitcast <8 x i16> %17 to <8 x half>
|
|
|
|
br i1 %cmp.not90, label %while.end, label %while.body
|
|
|
|
|
|
|
|
while.body: ; preds = %do.body, %while.body
|
|
|
|
%pIn.194 = phi half* [ %incdec.ptr4, %while.body ], [ %pIn.0, %do.body ]
|
|
|
|
%state.093 = phi <8 x half> [ %30, %while.body ], [ %9, %do.body ]
|
|
|
|
%pOut.192 = phi half* [ %incdec.ptr12, %while.body ], [ %pDst, %do.body ]
|
|
|
|
%sample.091 = phi i32 [ %dec, %while.body ], [ %div, %do.body ]
|
|
|
|
%incdec.ptr = getelementptr inbounds half, half* %pIn.194, i32 1
|
|
|
|
%19 = load half, half* %pIn.194, align 2
|
|
|
|
%incdec.ptr4 = getelementptr inbounds half, half* %pIn.194, i32 2
|
|
|
|
%20 = load half, half* %incdec.ptr, align 2
|
|
|
|
%.splatinsert = insertelement <8 x half> poison, half %19, i32 0
|
|
|
|
%.splat = shufflevector <8 x half> %.splatinsert, <8 x half> poison, <8 x i32> zeroinitializer
|
|
|
|
%21 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %4, <8 x half> %.splat, <8 x half> %state.093)
|
|
|
|
%22 = extractelement <8 x half> %21, i32 0
|
|
|
|
%.splat6 = shufflevector <8 x half> %21, <8 x half> poison, <8 x i32> zeroinitializer
|
|
|
|
%23 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %6, <8 x half> %.splat6, <8 x half> %21)
|
|
|
|
%24 = insertelement <8 x half> %23, half 0xH0000, i32 3
|
|
|
|
%.splatinsert7 = insertelement <8 x half> poison, half %20, i32 0
|
|
|
|
%.splat8 = shufflevector <8 x half> %.splatinsert7, <8 x half> poison, <8 x i32> zeroinitializer
|
|
|
|
%25 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %14, <8 x half> %.splat8, <8 x half> %24)
|
|
|
|
%26 = extractelement <8 x half> %25, i32 1
|
|
|
|
%.splat10 = shufflevector <8 x half> %25, <8 x half> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
|
|
|
|
%27 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %18, <8 x half> %.splat10, <8 x half> %25)
|
|
|
|
%28 = shufflevector <8 x half> %27, <8 x half> undef, <8 x i32> <i32 2, i32 undef, i32 undef, i32 3, i32 4, i32 5, i32 6, i32 7>
|
|
|
|
%29 = insertelement <8 x half> %28, half 0xH0000, i32 2
|
|
|
|
%30 = shufflevector <8 x half> %29, <8 x half> %27, <8 x i32> <i32 0, i32 11, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
|
|
|
|
%incdec.ptr11 = getelementptr inbounds half, half* %pOut.192, i32 1
|
|
|
|
store half %22, half* %pOut.192, align 2
|
|
|
|
%incdec.ptr12 = getelementptr inbounds half, half* %pOut.192, i32 2
|
|
|
|
store half %26, half* %incdec.ptr11, align 2
|
|
|
|
%dec = add nsw i32 %sample.091, -1
|
|
|
|
%cmp.not = icmp eq i32 %dec, 0
|
|
|
|
br i1 %cmp.not, label %while.end, label %while.body
|
|
|
|
|
|
|
|
while.end: ; preds = %while.body, %do.body
|
|
|
|
%pOut.1.lcssa = phi half* [ %pDst, %do.body ], [ %incdec.ptr12, %while.body ]
|
|
|
|
%state.0.lcssa = phi <8 x half> [ %9, %do.body ], [ %30, %while.body ]
|
|
|
|
%pIn.1.lcssa = phi half* [ %pIn.0, %do.body ], [ %incdec.ptr4, %while.body ]
|
|
|
|
br i1 %tobool.not, label %if.else, label %if.then
|
|
|
|
|
|
|
|
if.then: ; preds = %while.end
|
|
|
|
%31 = load half, half* %pIn.1.lcssa, align 2
|
|
|
|
%.splatinsert14 = insertelement <8 x half> poison, half %31, i32 0
|
|
|
|
%.splat15 = shufflevector <8 x half> %.splatinsert14, <8 x half> poison, <8 x i32> zeroinitializer
|
|
|
|
%32 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %4, <8 x half> %.splat15, <8 x half> %state.0.lcssa)
|
|
|
|
%33 = extractelement <8 x half> %32, i32 0
|
|
|
|
%.splat17 = shufflevector <8 x half> %32, <8 x half> poison, <8 x i32> zeroinitializer
|
|
|
|
%34 = tail call fast <8 x half> @llvm.fma.v8f16(<8 x half> %6, <8 x half> %.splat17, <8 x half> %32)
|
|
|
|
store half %33, half* %pOut.1.lcssa, align 2
|
|
|
|
%35 = extractelement <8 x half> %34, i32 1
|
|
|
|
store half %35, half* %pState.0, align 2
|
|
|
|
%36 = extractelement <8 x half> %34, i32 2
|
|
|
|
br label %if.end
|
|
|
|
|
|
|
|
if.else: ; preds = %while.end
|
|
|
|
%37 = extractelement <8 x half> %state.0.lcssa, i32 0
|
|
|
|
store half %37, half* %pState.0, align 2
|
|
|
|
%38 = extractelement <8 x half> %state.0.lcssa, i32 1
|
|
|
|
br label %if.end
|
|
|
|
|
|
|
|
if.end: ; preds = %if.else, %if.then
|
|
|
|
%.sink = phi half [ %38, %if.else ], [ %36, %if.then ]
|
|
|
|
%39 = getelementptr inbounds half, half* %pState.0, i32 1
|
|
|
|
store half %.sink, half* %39, align 2
|
|
|
|
%pState.1 = getelementptr inbounds half, half* %pState.0, i32 2
|
|
|
|
%dec23 = add i32 %stage.0, -1
|
|
|
|
%cmp24.not = icmp eq i32 %dec23, 0
|
|
|
|
br i1 %cmp24.not, label %do.end, label %do.body
|
|
|
|
|
|
|
|
do.end: ; preds = %if.end
|
|
|
|
ret void
|
|
|
|
}
|
|
|
|
|
2021-02-14 19:26:22 +01:00
|
|
|
define arm_aapcs_vfpcc half @vecAddAcrossF16Mve(<8 x half> %in) {
|
|
|
|
; CHECK-LABEL: vecAddAcrossF16Mve:
|
|
|
|
; CHECK: @ %bb.0: @ %entry
|
|
|
|
; CHECK-NEXT: vrev32.16 q1, q0
|
|
|
|
; CHECK-NEXT: vadd.f16 q0, q1, q0
|
|
|
|
; CHECK-NEXT: vrev64.32 q1, q0
|
|
|
|
; CHECK-NEXT: vadd.f16 q0, q0, q1
|
|
|
|
; CHECK-NEXT: vadd.f16 s0, s0, s2
|
|
|
|
; CHECK-NEXT: bx lr
|
|
|
|
entry:
|
|
|
|
%0 = shufflevector <8 x half> %in, <8 x half> poison, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
|
|
|
|
%1 = fadd fast <8 x half> %0, %in
|
|
|
|
%2 = bitcast <8 x half> %1 to <4 x i32>
|
|
|
|
%3 = shufflevector <4 x i32> %2, <4 x i32> poison, <4 x i32> <i32 1, i32 undef, i32 3, i32 undef>
|
|
|
|
%4 = bitcast <4 x i32> %3 to <8 x half>
|
|
|
|
%5 = fadd fast <8 x half> %1, %4
|
|
|
|
%6 = extractelement <8 x half> %5, i32 0
|
|
|
|
%7 = extractelement <8 x half> %5, i32 4
|
|
|
|
%add = fadd fast half %6, %7
|
|
|
|
ret half %add
|
|
|
|
}
|
|
|
|
|
2021-02-08 11:50:23 +01:00
|
|
|
declare { i32, <8 x i16> } @llvm.arm.mve.vshlc.v8i16(<8 x i16>, i32, i32)
|
2020-03-20 09:25:19 +01:00
|
|
|
declare void @llvm.assume(i1)
|
|
|
|
declare <8 x i1> @llvm.arm.mve.vctp16(i32)
|
|
|
|
declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>)
|
|
|
|
declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>)
|