1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 11:13:28 +01:00

[MVE] VMOVX patterns

This adds fp16 VMOVX patterns, using the same patterns as rL362482 with some
adjustments for MVE. It allows us to extract fp16 lanes from MVE vector
registers without moving the value into and out of GPRs.

VMOVX moves the top bits of an FP register, where an fp16 value may be held,
into the bottom bits of another FP register, zeroing the rest. This can be used
to extract the odd lanes of an MVE register. fp16 instructions do not read the
top bits of their source registers, so even lanes need no move at all: the
subregister holding the lane can be used directly.

Differential revision: https://reviews.llvm.org/D66793

llvm-svn: 370184
This commit is contained in:
David Green 2019-08-28 10:13:23 +00:00
parent 002a337ed8
commit 4ccc1cffed
14 changed files with 2405 additions and 4582 deletions

View File

@ -1346,8 +1346,12 @@ let Predicates = [HasMVEInt] in {
def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane),
(MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>;
def : Pat<(extractelt (v8f16 MQPR:$src), imm:$lane),
(COPY_TO_REGCLASS (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane), HPR)>;
def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane),
(EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>;
def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane),
(COPY_TO_REGCLASS
(VMOVH (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane))),
HPR)>;
def : Pat<(v4f32 (scalar_to_vector SPR:$src)),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;

View File

@ -932,60 +932,36 @@ entry:
define arm_aapcs_vfpcc <8 x half> @fdiv_f16(<8 x half> %in1, <8 x half> %in2) {
; CHECK-LABEL: fdiv_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u16 r0, q1[0]
; CHECK-NEXT: vmov.u16 r1, q1[1]
; CHECK-NEXT: vmov s8, r0
; CHECK-NEXT: vmov.u16 r0, q0[0]
; CHECK-NEXT: vmov s10, r0
; CHECK-NEXT: vmov.u16 r2, q0[1]
; CHECK-NEXT: vdiv.f16 s8, s10, s8
; CHECK-NEXT: vmov s10, r2
; CHECK-NEXT: vdiv.f16 s8, s0, s4
; CHECK-NEXT: vmovx.f16 s10, s0
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov s8, r1
; CHECK-NEXT: vmovx.f16 s8, s4
; CHECK-NEXT: vdiv.f16 s8, s10, s8
; CHECK-NEXT: vdiv.f16 s12, s1, s5
; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: vmov.16 q2[0], r0
; CHECK-NEXT: vmov.u16 r0, q1[2]
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vmovx.f16 s12, s5
; CHECK-NEXT: vmovx.f16 s14, s1
; CHECK-NEXT: vmov.16 q2[1], r1
; CHECK-NEXT: vmov s12, r0
; CHECK-NEXT: vmov.u16 r0, q0[2]
; CHECK-NEXT: vmov s14, r0
; CHECK-NEXT: vdiv.f16 s12, s14, s12
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vmov.16 q2[2], r0
; CHECK-NEXT: vmov.u16 r0, q1[3]
; CHECK-NEXT: vmov s12, r0
; CHECK-NEXT: vmov.u16 r0, q0[3]
; CHECK-NEXT: vmov s14, r0
; CHECK-NEXT: vdiv.f16 s12, s14, s12
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vdiv.f16 s12, s2, s6
; CHECK-NEXT: vmov.16 q2[3], r0
; CHECK-NEXT: vmov.u16 r0, q1[4]
; CHECK-NEXT: vmov s12, r0
; CHECK-NEXT: vmov.u16 r0, q0[4]
; CHECK-NEXT: vmov s14, r0
; CHECK-NEXT: vdiv.f16 s12, s14, s12
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vmovx.f16 s12, s6
; CHECK-NEXT: vmovx.f16 s14, s2
; CHECK-NEXT: vdiv.f16 s12, s14, s12
; CHECK-NEXT: vmov.16 q2[4], r0
; CHECK-NEXT: vmov.u16 r0, q1[5]
; CHECK-NEXT: vmov s12, r0
; CHECK-NEXT: vmov.u16 r0, q0[5]
; CHECK-NEXT: vmov s14, r0
; CHECK-NEXT: vdiv.f16 s12, s14, s12
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vdiv.f16 s12, s3, s7
; CHECK-NEXT: vmovx.f16 s4, s7
; CHECK-NEXT: vmovx.f16 s0, s3
; CHECK-NEXT: vmov.16 q2[5], r0
; CHECK-NEXT: vmov.u16 r0, q1[6]
; CHECK-NEXT: vmov s12, r0
; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov s14, r0
; CHECK-NEXT: vdiv.f16 s12, s14, s12
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vmov.16 q2[6], r0
; CHECK-NEXT: vmov.u16 r0, q1[7]
; CHECK-NEXT: vmov s4, r0
; CHECK-NEXT: vmov.u16 r0, q0[7]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vdiv.f16 s0, s0, s4
; CHECK-NEXT: vmov.16 q2[6], r0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov.16 q2[7], r0
; CHECK-NEXT: vmov q0, q2
@ -1005,25 +981,19 @@ define arm_aapcs_vfpcc <8 x half> @frem_f16(<8 x half> %in1, <8 x half> %in2) {
; CHECK-NEXT: .pad #64
; CHECK-NEXT: sub sp, #64
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vmov.u16 r0, q1[0]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmov.u16 r0, q4[0]
; CHECK-NEXT: vmov s2, r0
; CHECK-NEXT: vmov q5, q1
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vstr s2, [sp, #56]
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s16
; CHECK-NEXT: vstr s0, [sp, #56]
; CHECK-NEXT: vcvtb.f32.f16 s0, s20
; CHECK-NEXT: vstr s0, [sp, #60]
; CHECK-NEXT: ldrd r0, r1, [sp, #56]
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmov.u16 r0, q5[1]
; CHECK-NEXT: vmovx.f16 s2, s16
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmov.u16 r0, q4[1]
; CHECK-NEXT: vmov s2, r0
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vmovx.f16 s0, s20
; CHECK-NEXT: vstr s2, [sp, #48]
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-NEXT: vstr s0, [sp, #52]
@ -1033,85 +1003,67 @@ define arm_aapcs_vfpcc <8 x half> @frem_f16(<8 x half> %in1, <8 x half> %in2) {
; CHECK-NEXT: vmov.16 q6[0], r4
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov.16 q6[1], r0
; CHECK-NEXT: vmov.u16 r0, q5[2]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmov.u16 r0, q4[2]
; CHECK-NEXT: vmov s2, r0
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vstr s2, [sp, #40]
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s17
; CHECK-NEXT: vstr s0, [sp, #40]
; CHECK-NEXT: vcvtb.f32.f16 s0, s21
; CHECK-NEXT: vstr s0, [sp, #44]
; CHECK-NEXT: vmov.16 q6[1], r0
; CHECK-NEXT: ldrd r0, r1, [sp, #40]
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmovx.f16 s2, s17
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov.16 q6[2], r0
; CHECK-NEXT: vmov.u16 r0, q5[3]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmov.u16 r0, q4[3]
; CHECK-NEXT: vmov s2, r0
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmovx.f16 s0, s21
; CHECK-NEXT: vstr s2, [sp, #32]
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-NEXT: vstr s0, [sp, #36]
; CHECK-NEXT: vmov.16 q6[2], r0
; CHECK-NEXT: ldrd r0, r1, [sp, #32]
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov.16 q6[3], r0
; CHECK-NEXT: vmov.u16 r0, q5[4]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmov.u16 r0, q4[4]
; CHECK-NEXT: vmov s2, r0
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vstr s2, [sp, #24]
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s18
; CHECK-NEXT: vstr s0, [sp, #24]
; CHECK-NEXT: vcvtb.f32.f16 s0, s22
; CHECK-NEXT: vstr s0, [sp, #28]
; CHECK-NEXT: vmov.16 q6[3], r0
; CHECK-NEXT: ldrd r0, r1, [sp, #24]
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmovx.f16 s2, s18
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov.16 q6[4], r0
; CHECK-NEXT: vmov.u16 r0, q5[5]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmov.u16 r0, q4[5]
; CHECK-NEXT: vmov s2, r0
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmovx.f16 s0, s22
; CHECK-NEXT: vstr s2, [sp, #16]
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-NEXT: vstr s0, [sp, #20]
; CHECK-NEXT: vmov.16 q6[4], r0
; CHECK-NEXT: ldrd r0, r1, [sp, #16]
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov.16 q6[5], r0
; CHECK-NEXT: vmov.u16 r0, q5[6]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmov.u16 r0, q4[6]
; CHECK-NEXT: vmov s2, r0
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vstr s2, [sp, #8]
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-NEXT: vcvtb.f32.f16 s0, s19
; CHECK-NEXT: vstr s0, [sp, #8]
; CHECK-NEXT: vcvtb.f32.f16 s0, s23
; CHECK-NEXT: vstr s0, [sp, #12]
; CHECK-NEXT: vmov.16 q6[5], r0
; CHECK-NEXT: ldrd r0, r1, [sp, #8]
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmovx.f16 s2, s19
; CHECK-NEXT: vcvtb.f16.f32 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov.16 q6[6], r0
; CHECK-NEXT: vmov.u16 r0, q5[7]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vmov.u16 r0, q4[7]
; CHECK-NEXT: vmov s2, r0
; CHECK-NEXT: vcvtb.f32.f16 s2, s2
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmovx.f16 s0, s23
; CHECK-NEXT: vstr s2, [sp]
; CHECK-NEXT: vcvtb.f32.f16 s0, s0
; CHECK-NEXT: vstr s0, [sp, #4]
; CHECK-NEXT: vmov.16 q6[6], r0
; CHECK-NEXT: ldrd r0, r1, [sp]
; CHECK-NEXT: bl fmodf
; CHECK-NEXT: vmov s0, r0

View File

@ -19,76 +19,43 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v1(<8 x half> %src1, <8 x half> %src2,
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
; CHECK-MVE-NEXT: vpush {d8, d9, d10}
; CHECK-MVE-NEXT: vmov.u16 r0, q2[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q1[0]
; CHECK-MVE-NEXT: vmov.u16 r2, q0[0]
; CHECK-MVE-NEXT: vmov s14, r1
; CHECK-MVE-NEXT: vmov s13, r2
; CHECK-MVE-NEXT: vmov.u16 r1, q2[1]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r2, q1[1]
; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12
; CHECK-MVE-NEXT: vmov.u16 r3, q0[1]
; CHECK-MVE-NEXT: vmov r0, s13
; CHECK-MVE-NEXT: vmov s14, r2
; CHECK-MVE-NEXT: vmov.u16 r2, q0[2]
; CHECK-MVE-NEXT: vmov s12, r1
; CHECK-MVE-NEXT: vmov s13, r3
; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12
; CHECK-MVE-NEXT: vmov s20, r2
; CHECK-MVE-NEXT: vmov r1, s13
; CHECK-MVE-NEXT: vmov.16 q3[0], r0
; CHECK-MVE-NEXT: vmov.16 q3[1], r1
; CHECK-MVE-NEXT: vmov.u16 r0, q2[2]
; CHECK-MVE-NEXT: vmov.u16 r1, q1[2]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmov s18, r1
; CHECK-MVE-NEXT: vmov.u16 r1, q1[4]
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov.u16 r2, q0[4]
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.16 q3[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[3]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[3]
; CHECK-MVE-NEXT: vmov s18, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s20, r0
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov s18, r1
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.u16 r1, q1[6]
; CHECK-MVE-NEXT: vmov.16 q3[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[4]
; CHECK-MVE-NEXT: vmov s20, r2
; CHECK-MVE-NEXT: vmov.u16 r2, q0[6]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.16 q3[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[5]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[5]
; CHECK-MVE-NEXT: vmov s18, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s20, r0
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov s18, r1
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.16 q3[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[6]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmov s20, r2
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.16 q3[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[7]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[7]
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vmovx.f16 s13, s0
; CHECK-MVE-NEXT: vmla.f16 s0, s4, s8
; CHECK-MVE-NEXT: vmovx.f16 s12, s8
; CHECK-MVE-NEXT: vmovx.f16 s14, s4
; CHECK-MVE-NEXT: vmov.f32 s16, s1
; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12
; CHECK-MVE-NEXT: vmov r1, s0
; CHECK-MVE-NEXT: vmla.f16 s16, s5, s9
; CHECK-MVE-NEXT: vmov r0, s13
; CHECK-MVE-NEXT: vmov.16 q3[0], r1
; CHECK-MVE-NEXT: vmov.16 q3[1], r0
; CHECK-MVE-NEXT: vmov r0, s16
; CHECK-MVE-NEXT: vmovx.f16 s16, s9
; CHECK-MVE-NEXT: vmovx.f16 s18, s5
; CHECK-MVE-NEXT: vmovx.f16 s20, s1
; CHECK-MVE-NEXT: vmov.16 q3[2], r0
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov.f32 s16, s2
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmla.f16 s16, s6, s10
; CHECK-MVE-NEXT: vmov.16 q3[3], r0
; CHECK-MVE-NEXT: vmov r0, s16
; CHECK-MVE-NEXT: vmovx.f16 s16, s10
; CHECK-MVE-NEXT: vmovx.f16 s18, s6
; CHECK-MVE-NEXT: vmovx.f16 s20, s2
; CHECK-MVE-NEXT: vmov.16 q3[4], r0
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov.f32 s16, s3
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmla.f16 s16, s7, s11
; CHECK-MVE-NEXT: vmovx.f16 s8, s11
; CHECK-MVE-NEXT: vmovx.f16 s4, s7
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmov.16 q3[5], r0
; CHECK-MVE-NEXT: vmov r0, s16
; CHECK-MVE-NEXT: vmla.f16 s0, s4, s8
; CHECK-MVE-NEXT: vmov.16 q3[6], r0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q3[7], r0
; CHECK-MVE-NEXT: vmov q0, q3
@ -116,76 +83,43 @@ define arm_aapcs_vfpcc <8 x half> @vfma16_v2(<8 x half> %src1, <8 x half> %src2,
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
; CHECK-MVE-NEXT: vpush {d8, d9, d10}
; CHECK-MVE-NEXT: vmov.u16 r0, q2[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q1[0]
; CHECK-MVE-NEXT: vmov.u16 r2, q0[0]
; CHECK-MVE-NEXT: vmov s14, r1
; CHECK-MVE-NEXT: vmov s13, r2
; CHECK-MVE-NEXT: vmov.u16 r1, q2[1]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r2, q1[1]
; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12
; CHECK-MVE-NEXT: vmov.u16 r3, q0[1]
; CHECK-MVE-NEXT: vmov r0, s13
; CHECK-MVE-NEXT: vmov s14, r2
; CHECK-MVE-NEXT: vmov.u16 r2, q0[2]
; CHECK-MVE-NEXT: vmov s12, r1
; CHECK-MVE-NEXT: vmov s13, r3
; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12
; CHECK-MVE-NEXT: vmov s20, r2
; CHECK-MVE-NEXT: vmov r1, s13
; CHECK-MVE-NEXT: vmov.16 q3[0], r0
; CHECK-MVE-NEXT: vmov.16 q3[1], r1
; CHECK-MVE-NEXT: vmov.u16 r0, q2[2]
; CHECK-MVE-NEXT: vmov.u16 r1, q1[2]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmov s18, r1
; CHECK-MVE-NEXT: vmov.u16 r1, q1[4]
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov.u16 r2, q0[4]
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.16 q3[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[3]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[3]
; CHECK-MVE-NEXT: vmov s18, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s20, r0
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov s18, r1
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.u16 r1, q1[6]
; CHECK-MVE-NEXT: vmov.16 q3[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[4]
; CHECK-MVE-NEXT: vmov s20, r2
; CHECK-MVE-NEXT: vmov.u16 r2, q0[6]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.16 q3[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[5]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[5]
; CHECK-MVE-NEXT: vmov s18, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s20, r0
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov s18, r1
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.16 q3[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[6]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmov s20, r2
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.16 q3[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[7]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[7]
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vmovx.f16 s13, s0
; CHECK-MVE-NEXT: vmla.f16 s0, s4, s8
; CHECK-MVE-NEXT: vmovx.f16 s12, s8
; CHECK-MVE-NEXT: vmovx.f16 s14, s4
; CHECK-MVE-NEXT: vmov.f32 s16, s1
; CHECK-MVE-NEXT: vmla.f16 s13, s14, s12
; CHECK-MVE-NEXT: vmov r1, s0
; CHECK-MVE-NEXT: vmla.f16 s16, s5, s9
; CHECK-MVE-NEXT: vmov r0, s13
; CHECK-MVE-NEXT: vmov.16 q3[0], r1
; CHECK-MVE-NEXT: vmov.16 q3[1], r0
; CHECK-MVE-NEXT: vmov r0, s16
; CHECK-MVE-NEXT: vmovx.f16 s16, s9
; CHECK-MVE-NEXT: vmovx.f16 s18, s5
; CHECK-MVE-NEXT: vmovx.f16 s20, s1
; CHECK-MVE-NEXT: vmov.16 q3[2], r0
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov.f32 s16, s2
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmla.f16 s16, s6, s10
; CHECK-MVE-NEXT: vmov.16 q3[3], r0
; CHECK-MVE-NEXT: vmov r0, s16
; CHECK-MVE-NEXT: vmovx.f16 s16, s10
; CHECK-MVE-NEXT: vmovx.f16 s18, s6
; CHECK-MVE-NEXT: vmovx.f16 s20, s2
; CHECK-MVE-NEXT: vmov.16 q3[4], r0
; CHECK-MVE-NEXT: vmla.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov.f32 s16, s3
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmla.f16 s16, s7, s11
; CHECK-MVE-NEXT: vmovx.f16 s8, s11
; CHECK-MVE-NEXT: vmovx.f16 s4, s7
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmov.16 q3[5], r0
; CHECK-MVE-NEXT: vmov r0, s16
; CHECK-MVE-NEXT: vmla.f16 s0, s4, s8
; CHECK-MVE-NEXT: vmov.16 q3[6], r0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q3[7], r0
; CHECK-MVE-NEXT: vmov q0, q3
@ -213,76 +147,43 @@ define arm_aapcs_vfpcc <8 x half> @vfms16(<8 x half> %src1, <8 x half> %src2, <8
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: .vsave {d8, d9, d10}
; CHECK-MVE-NEXT: vpush {d8, d9, d10}
; CHECK-MVE-NEXT: vmov.u16 r0, q2[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q1[0]
; CHECK-MVE-NEXT: vmov.u16 r2, q0[0]
; CHECK-MVE-NEXT: vmov s14, r1
; CHECK-MVE-NEXT: vmov s13, r2
; CHECK-MVE-NEXT: vmov.u16 r1, q2[1]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r2, q1[1]
; CHECK-MVE-NEXT: vmls.f16 s13, s14, s12
; CHECK-MVE-NEXT: vmov.u16 r3, q0[1]
; CHECK-MVE-NEXT: vmov r0, s13
; CHECK-MVE-NEXT: vmov s14, r2
; CHECK-MVE-NEXT: vmov.u16 r2, q0[2]
; CHECK-MVE-NEXT: vmov s12, r1
; CHECK-MVE-NEXT: vmov s13, r3
; CHECK-MVE-NEXT: vmls.f16 s13, s14, s12
; CHECK-MVE-NEXT: vmov s20, r2
; CHECK-MVE-NEXT: vmov r1, s13
; CHECK-MVE-NEXT: vmov.16 q3[0], r0
; CHECK-MVE-NEXT: vmov.16 q3[1], r1
; CHECK-MVE-NEXT: vmov.u16 r0, q2[2]
; CHECK-MVE-NEXT: vmov.u16 r1, q1[2]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmov s18, r1
; CHECK-MVE-NEXT: vmov.u16 r1, q1[4]
; CHECK-MVE-NEXT: vmls.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov.u16 r2, q0[4]
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.16 q3[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[3]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[3]
; CHECK-MVE-NEXT: vmov s18, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s20, r0
; CHECK-MVE-NEXT: vmls.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov s18, r1
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.u16 r1, q1[6]
; CHECK-MVE-NEXT: vmov.16 q3[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[4]
; CHECK-MVE-NEXT: vmov s20, r2
; CHECK-MVE-NEXT: vmov.u16 r2, q0[6]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmls.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.16 q3[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[5]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[5]
; CHECK-MVE-NEXT: vmov s18, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s20, r0
; CHECK-MVE-NEXT: vmls.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov s18, r1
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.16 q3[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[6]
; CHECK-MVE-NEXT: vmov s16, r0
; CHECK-MVE-NEXT: vmov s20, r2
; CHECK-MVE-NEXT: vmls.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmov.16 q3[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q2[7]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[7]
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vmovx.f16 s13, s0
; CHECK-MVE-NEXT: vmls.f16 s0, s4, s8
; CHECK-MVE-NEXT: vmovx.f16 s12, s8
; CHECK-MVE-NEXT: vmovx.f16 s14, s4
; CHECK-MVE-NEXT: vmov.f32 s16, s1
; CHECK-MVE-NEXT: vmls.f16 s13, s14, s12
; CHECK-MVE-NEXT: vmov r1, s0
; CHECK-MVE-NEXT: vmls.f16 s16, s5, s9
; CHECK-MVE-NEXT: vmov r0, s13
; CHECK-MVE-NEXT: vmov.16 q3[0], r1
; CHECK-MVE-NEXT: vmov.16 q3[1], r0
; CHECK-MVE-NEXT: vmov r0, s16
; CHECK-MVE-NEXT: vmovx.f16 s16, s9
; CHECK-MVE-NEXT: vmovx.f16 s18, s5
; CHECK-MVE-NEXT: vmovx.f16 s20, s1
; CHECK-MVE-NEXT: vmov.16 q3[2], r0
; CHECK-MVE-NEXT: vmls.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov.f32 s16, s2
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmls.f16 s16, s6, s10
; CHECK-MVE-NEXT: vmov.16 q3[3], r0
; CHECK-MVE-NEXT: vmov r0, s16
; CHECK-MVE-NEXT: vmovx.f16 s16, s10
; CHECK-MVE-NEXT: vmovx.f16 s18, s6
; CHECK-MVE-NEXT: vmovx.f16 s20, s2
; CHECK-MVE-NEXT: vmov.16 q3[4], r0
; CHECK-MVE-NEXT: vmls.f16 s20, s18, s16
; CHECK-MVE-NEXT: vmov.f32 s16, s3
; CHECK-MVE-NEXT: vmov r0, s20
; CHECK-MVE-NEXT: vmls.f16 s16, s7, s11
; CHECK-MVE-NEXT: vmovx.f16 s8, s11
; CHECK-MVE-NEXT: vmovx.f16 s4, s7
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmov.16 q3[5], r0
; CHECK-MVE-NEXT: vmov r0, s16
; CHECK-MVE-NEXT: vmls.f16 s0, s4, s8
; CHECK-MVE-NEXT: vmov.16 q3[6], r0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q3[7], r0
; CHECK-MVE-NEXT: vmov q0, q3

File diff suppressed because it is too large Load Diff

View File

@ -5,44 +5,32 @@
define arm_aapcs_vfpcc <8 x half> @fneg_float16_t(<8 x half> %src) {
; CHECK-MVE-LABEL: fneg_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q0[1]
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vmovx.f16 s4, s0
; CHECK-MVE-NEXT: vneg.f16 s8, s1
; CHECK-MVE-NEXT: vneg.f16 s4, s4
; CHECK-MVE-NEXT: vmov r0, s4
; CHECK-MVE-NEXT: vmov s4, r1
; CHECK-MVE-NEXT: vneg.f16 s4, s4
; CHECK-MVE-NEXT: vneg.f16 s4, s0
; CHECK-MVE-NEXT: vmov r1, s4
; CHECK-MVE-NEXT: vmov.16 q1[0], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
; CHECK-MVE-NEXT: vmov.16 q1[1], r1
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vneg.f16 s8, s8
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmov.16 q1[0], r1
; CHECK-MVE-NEXT: vneg.f16 s0, s0
; CHECK-MVE-NEXT: vmov.16 q1[1], r0
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s1
; CHECK-MVE-NEXT: vmov.16 q1[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vneg.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vneg.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vneg.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vneg.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vneg.f16 s8, s3
; CHECK-MVE-NEXT: vmov.16 q1[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vneg.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmov.16 q1[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vneg.f16 s0, s0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q1[7], r0
; CHECK-MVE-NEXT: vmov q0, q1
@ -112,44 +100,32 @@ entry:
define arm_aapcs_vfpcc <8 x half> @fabs_float16_t(<8 x half> %src) {
; CHECK-MVE-LABEL: fabs_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q0[1]
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vmovx.f16 s4, s0
; CHECK-MVE-NEXT: vabs.f16 s8, s1
; CHECK-MVE-NEXT: vabs.f16 s4, s4
; CHECK-MVE-NEXT: vmov r0, s4
; CHECK-MVE-NEXT: vmov s4, r1
; CHECK-MVE-NEXT: vabs.f16 s4, s4
; CHECK-MVE-NEXT: vabs.f16 s4, s0
; CHECK-MVE-NEXT: vmov r1, s4
; CHECK-MVE-NEXT: vmov.16 q1[0], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
; CHECK-MVE-NEXT: vmov.16 q1[1], r1
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vabs.f16 s8, s8
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmov.16 q1[0], r1
; CHECK-MVE-NEXT: vabs.f16 s0, s0
; CHECK-MVE-NEXT: vmov.16 q1[1], r0
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s1
; CHECK-MVE-NEXT: vmov.16 q1[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vabs.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vabs.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vabs.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vabs.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vabs.f16 s8, s3
; CHECK-MVE-NEXT: vmov.16 q1[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vabs.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmov.16 q1[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vabs.f16 s0, s0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q1[7], r0
; CHECK-MVE-NEXT: vmov q0, q1

View File

@ -24,44 +24,32 @@ entry:
define arm_aapcs_vfpcc <8 x half> @fceil_float16_t(<8 x half> %src) {
; CHECK-MVE-LABEL: fceil_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q0[1]
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vmovx.f16 s4, s0
; CHECK-MVE-NEXT: vrintp.f16 s8, s1
; CHECK-MVE-NEXT: vrintp.f16 s4, s4
; CHECK-MVE-NEXT: vmov r0, s4
; CHECK-MVE-NEXT: vmov s4, r1
; CHECK-MVE-NEXT: vrintp.f16 s4, s4
; CHECK-MVE-NEXT: vrintp.f16 s4, s0
; CHECK-MVE-NEXT: vmov r1, s4
; CHECK-MVE-NEXT: vmov.16 q1[0], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
; CHECK-MVE-NEXT: vmov.16 q1[1], r1
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintp.f16 s8, s8
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmov.16 q1[0], r1
; CHECK-MVE-NEXT: vrintp.f16 s0, s0
; CHECK-MVE-NEXT: vmov.16 q1[1], r0
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s1
; CHECK-MVE-NEXT: vmov.16 q1[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintp.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vrintp.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintp.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintp.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vrintp.f16 s8, s3
; CHECK-MVE-NEXT: vmov.16 q1[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintp.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmov.16 q1[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vrintp.f16 s0, s0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q1[7], r0
; CHECK-MVE-NEXT: vmov q0, q1
@ -122,44 +110,32 @@ entry:
define arm_aapcs_vfpcc <8 x half> @ftrunc_float16_t(<8 x half> %src) {
; CHECK-MVE-LABEL: ftrunc_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q0[1]
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vmovx.f16 s4, s0
; CHECK-MVE-NEXT: vrintz.f16 s8, s1
; CHECK-MVE-NEXT: vrintz.f16 s4, s4
; CHECK-MVE-NEXT: vmov r0, s4
; CHECK-MVE-NEXT: vmov s4, r1
; CHECK-MVE-NEXT: vrintz.f16 s4, s4
; CHECK-MVE-NEXT: vrintz.f16 s4, s0
; CHECK-MVE-NEXT: vmov r1, s4
; CHECK-MVE-NEXT: vmov.16 q1[0], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
; CHECK-MVE-NEXT: vmov.16 q1[1], r1
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintz.f16 s8, s8
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmov.16 q1[0], r1
; CHECK-MVE-NEXT: vrintz.f16 s0, s0
; CHECK-MVE-NEXT: vmov.16 q1[1], r0
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s1
; CHECK-MVE-NEXT: vmov.16 q1[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintz.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vrintz.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintz.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintz.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vrintz.f16 s8, s3
; CHECK-MVE-NEXT: vmov.16 q1[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintz.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmov.16 q1[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vrintz.f16 s0, s0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q1[7], r0
; CHECK-MVE-NEXT: vmov q0, q1
@ -220,44 +196,32 @@ entry:
define arm_aapcs_vfpcc <8 x half> @frint_float16_t(<8 x half> %src) {
; CHECK-MVE-LABEL: frint_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q0[1]
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vmovx.f16 s4, s0
; CHECK-MVE-NEXT: vrintx.f16 s8, s1
; CHECK-MVE-NEXT: vrintx.f16 s4, s4
; CHECK-MVE-NEXT: vmov r0, s4
; CHECK-MVE-NEXT: vmov s4, r1
; CHECK-MVE-NEXT: vrintx.f16 s4, s4
; CHECK-MVE-NEXT: vrintx.f16 s4, s0
; CHECK-MVE-NEXT: vmov r1, s4
; CHECK-MVE-NEXT: vmov.16 q1[0], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
; CHECK-MVE-NEXT: vmov.16 q1[1], r1
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintx.f16 s8, s8
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmov.16 q1[0], r1
; CHECK-MVE-NEXT: vrintx.f16 s0, s0
; CHECK-MVE-NEXT: vmov.16 q1[1], r0
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s1
; CHECK-MVE-NEXT: vmov.16 q1[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintx.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vrintx.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintx.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintx.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vrintx.f16 s8, s3
; CHECK-MVE-NEXT: vmov.16 q1[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintx.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmov.16 q1[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vrintx.f16 s0, s0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q1[7], r0
; CHECK-MVE-NEXT: vmov q0, q1
@ -313,44 +277,32 @@ entry:
define arm_aapcs_vfpcc <8 x half> @fnearbyint_float16_t(<8 x half> %src) {
; CHECK-LABEL: fnearbyint_float16_t:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u16 r0, q0[0]
; CHECK-NEXT: vmov.u16 r1, q0[1]
; CHECK-NEXT: vmov s4, r0
; CHECK-NEXT: vmovx.f16 s4, s0
; CHECK-NEXT: vrintr.f16 s8, s1
; CHECK-NEXT: vrintr.f16 s4, s4
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov s4, r1
; CHECK-NEXT: vrintr.f16 s4, s4
; CHECK-NEXT: vrintr.f16 s4, s0
; CHECK-NEXT: vmov r1, s4
; CHECK-NEXT: vmov.16 q1[0], r0
; CHECK-NEXT: vmov.u16 r0, q0[2]
; CHECK-NEXT: vmov.16 q1[1], r1
; CHECK-NEXT: vmov s8, r0
; CHECK-NEXT: vrintr.f16 s8, s8
; CHECK-NEXT: vmovx.f16 s0, s3
; CHECK-NEXT: vmov.16 q1[0], r1
; CHECK-NEXT: vrintr.f16 s0, s0
; CHECK-NEXT: vmov.16 q1[1], r0
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vmov.16 q1[2], r0
; CHECK-NEXT: vmov.u16 r0, q0[3]
; CHECK-NEXT: vmov s8, r0
; CHECK-NEXT: vrintr.f16 s8, s8
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vrintr.f16 s8, s2
; CHECK-NEXT: vmov.16 q1[3], r0
; CHECK-NEXT: vmov.u16 r0, q0[4]
; CHECK-NEXT: vmov s8, r0
; CHECK-NEXT: vrintr.f16 s8, s8
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmovx.f16 s8, s2
; CHECK-NEXT: vmov.16 q1[4], r0
; CHECK-NEXT: vmov.u16 r0, q0[5]
; CHECK-NEXT: vmov s8, r0
; CHECK-NEXT: vrintr.f16 s8, s8
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vrintr.f16 s8, s3
; CHECK-NEXT: vmov.16 q1[5], r0
; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov s8, r0
; CHECK-NEXT: vrintr.f16 s8, s8
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov.16 q1[6], r0
; CHECK-NEXT: vmov.u16 r0, q0[7]
; CHECK-NEXT: vmov s0, r0
; CHECK-NEXT: vrintr.f16 s0, s0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov.16 q1[7], r0
; CHECK-NEXT: vmov q0, q1
@ -406,44 +358,32 @@ entry:
define arm_aapcs_vfpcc <8 x half> @ffloor_float16_t(<8 x half> %src) {
; CHECK-MVE-LABEL: ffloor_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q0[1]
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vmovx.f16 s4, s0
; CHECK-MVE-NEXT: vrintm.f16 s8, s1
; CHECK-MVE-NEXT: vrintm.f16 s4, s4
; CHECK-MVE-NEXT: vmov r0, s4
; CHECK-MVE-NEXT: vmov s4, r1
; CHECK-MVE-NEXT: vrintm.f16 s4, s4
; CHECK-MVE-NEXT: vrintm.f16 s4, s0
; CHECK-MVE-NEXT: vmov r1, s4
; CHECK-MVE-NEXT: vmov.16 q1[0], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
; CHECK-MVE-NEXT: vmov.16 q1[1], r1
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintm.f16 s8, s8
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmov.16 q1[0], r1
; CHECK-MVE-NEXT: vrintm.f16 s0, s0
; CHECK-MVE-NEXT: vmov.16 q1[1], r0
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s1
; CHECK-MVE-NEXT: vmov.16 q1[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintm.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vrintm.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintm.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintm.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vrintm.f16 s8, s3
; CHECK-MVE-NEXT: vmov.16 q1[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrintm.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmov.16 q1[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vrintm.f16 s0, s0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q1[7], r0
; CHECK-MVE-NEXT: vmov q0, q1
@ -504,44 +444,32 @@ entry:
define arm_aapcs_vfpcc <8 x half> @fround_float16_t(<8 x half> %src) {
; CHECK-MVE-LABEL: fround_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q0[1]
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vmovx.f16 s4, s0
; CHECK-MVE-NEXT: vrinta.f16 s8, s1
; CHECK-MVE-NEXT: vrinta.f16 s4, s4
; CHECK-MVE-NEXT: vmov r0, s4
; CHECK-MVE-NEXT: vmov s4, r1
; CHECK-MVE-NEXT: vrinta.f16 s4, s4
; CHECK-MVE-NEXT: vrinta.f16 s4, s0
; CHECK-MVE-NEXT: vmov r1, s4
; CHECK-MVE-NEXT: vmov.16 q1[0], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
; CHECK-MVE-NEXT: vmov.16 q1[1], r1
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrinta.f16 s8, s8
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmov.16 q1[0], r1
; CHECK-MVE-NEXT: vrinta.f16 s0, s0
; CHECK-MVE-NEXT: vmov.16 q1[1], r0
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s1
; CHECK-MVE-NEXT: vmov.16 q1[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrinta.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vrinta.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrinta.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmovx.f16 s8, s2
; CHECK-MVE-NEXT: vmov.16 q1[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrinta.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vrinta.f16 s8, s3
; CHECK-MVE-NEXT: vmov.16 q1[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vrinta.f16 s8, s8
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmov.16 q1[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vrinta.f16 s0, s0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q1[7], r0
; CHECK-MVE-NEXT: vmov q0, q1

View File

@ -331,60 +331,36 @@ entry:
define arm_aapcs_vfpcc <8 x half> @minnm_float16_t(<8 x half> %src1, <8 x half> %src2) {
; CHECK-MVE-LABEL: minnm_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q0[1]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[0]
; CHECK-MVE-NEXT: vmov s10, r0
; CHECK-MVE-NEXT: vmov.u16 r2, q1[1]
; CHECK-MVE-NEXT: vminnm.f16 s8, s10, s8
; CHECK-MVE-NEXT: vmov s10, r2
; CHECK-MVE-NEXT: vminnm.f16 s8, s4, s0
; CHECK-MVE-NEXT: vmovx.f16 s10, s4
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmov s8, r1
; CHECK-MVE-NEXT: vmovx.f16 s8, s0
; CHECK-MVE-NEXT: vminnm.f16 s8, s10, s8
; CHECK-MVE-NEXT: vminnm.f16 s12, s5, s1
; CHECK-MVE-NEXT: vmov r1, s8
; CHECK-MVE-NEXT: vmov.16 q2[0], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s1
; CHECK-MVE-NEXT: vmovx.f16 s14, s5
; CHECK-MVE-NEXT: vmov.16 q2[1], r1
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[2]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmov.16 q2[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[3]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vminnm.f16 s12, s6, s2
; CHECK-MVE-NEXT: vmov.16 q2[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[4]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s2
; CHECK-MVE-NEXT: vmovx.f16 s14, s6
; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov.16 q2[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[5]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmovx.f16 s2, s7
; CHECK-MVE-NEXT: vminnm.f16 s12, s7, s3
; CHECK-MVE-NEXT: vmov.16 q2[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[6]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vminnm.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmov.16 q2[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[7]
; CHECK-MVE-NEXT: vmov s2, r0
; CHECK-MVE-NEXT: vminnm.f16 s0, s2, s0
; CHECK-MVE-NEXT: vmov.16 q2[6], r0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q2[7], r0
; CHECK-MVE-NEXT: vmov q0, q2

View File

@ -348,21 +348,25 @@ entry:
define arm_aapcs_vfpcc <8 x half> @shuffle1_f16(<8 x half> %src) {
; CHECK-LABEL: shuffle1_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u16 r0, q0[7]
; CHECK-NEXT: vmov.u16 r1, q0[6]
; CHECK-NEXT: vmov.16 q1[0], r0
; CHECK-NEXT: vmov.u16 r0, q0[5]
; CHECK-NEXT: vmov.16 q1[1], r1
; CHECK-NEXT: vmovx.f16 s4, s3
; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov r1, s4
; CHECK-NEXT: vmovx.f16 s8, s2
; CHECK-NEXT: vmov.16 q1[0], r1
; CHECK-NEXT: vmov.16 q1[1], r0
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov.16 q1[2], r0
; CHECK-NEXT: vmov.u16 r0, q0[4]
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vmov.16 q1[3], r0
; CHECK-NEXT: vmov.u16 r0, q0[3]
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmovx.f16 s8, s0
; CHECK-NEXT: vmov.16 q1[4], r0
; CHECK-NEXT: vmov.u16 r0, q0[2]
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: vmov.16 q1[5], r0
; CHECK-NEXT: vmov.u16 r0, q0[1]
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov.16 q1[6], r0
; CHECK-NEXT: vmov.u16 r0, q0[0]
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov.16 q1[7], r0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
@ -383,21 +387,25 @@ entry:
define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) {
; CHECK-LABEL: shuffle3_f16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u16 r0, q0[4]
; CHECK-NEXT: vmov.u16 r1, q0[5]
; CHECK-NEXT: vmov.16 q1[0], r0
; CHECK-NEXT: vmov.u16 r0, q0[7]
; CHECK-NEXT: vmov.16 q1[1], r1
; CHECK-NEXT: vmovx.f16 s4, s2
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov.16 q1[0], r1
; CHECK-NEXT: vmovx.f16 s8, s3
; CHECK-NEXT: vmov.16 q1[1], r0
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmovx.f16 s8, s1
; CHECK-NEXT: vmov.16 q1[2], r0
; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov r0, s3
; CHECK-NEXT: vmov.16 q1[3], r0
; CHECK-NEXT: vmov.u16 r0, q0[3]
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmovx.f16 s8, s0
; CHECK-NEXT: vmov.16 q1[4], r0
; CHECK-NEXT: vmov.u16 r0, q0[1]
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov.16 q1[5], r0
; CHECK-NEXT: vmov.u16 r0, q0[2]
; CHECK-NEXT: vmov r0, s1
; CHECK-NEXT: vmov.16 q1[6], r0
; CHECK-NEXT: vmov.u16 r0, q0[0]
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov.16 q1[7], r0
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: bx lr
@ -681,8 +689,6 @@ entry:
define arm_aapcs_vfpcc half @extract_f16_0(<8 x half> %a) {
; CHECK-LABEL: extract_f16_0:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u16 r1, q0[0]
; CHECK-NEXT: vmov s0, r1
; CHECK-NEXT: vstr.16 s0, [r0]
; CHECK-NEXT: bx lr
entry:
@ -693,8 +699,7 @@ entry:
define arm_aapcs_vfpcc half @extract_f16_3(<8 x half> %a) {
; CHECK-LABEL: extract_f16_3:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u16 r1, q0[3]
; CHECK-NEXT: vmov s0, r1
; CHECK-NEXT: vmovx.f16 s0, s1
; CHECK-NEXT: vstr.16 s0, [r0]
; CHECK-NEXT: bx lr
entry:

View File

@ -81,60 +81,36 @@ entry:
define arm_aapcs_vfpcc <8 x half> @add_float16_t(<8 x half> %src1, <8 x half> %src2) {
; CHECK-MVE-LABEL: add_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q0[1]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[0]
; CHECK-MVE-NEXT: vmov s10, r0
; CHECK-MVE-NEXT: vmov.u16 r2, q1[1]
; CHECK-MVE-NEXT: vadd.f16 s8, s10, s8
; CHECK-MVE-NEXT: vmov s10, r2
; CHECK-MVE-NEXT: vadd.f16 s8, s4, s0
; CHECK-MVE-NEXT: vmovx.f16 s10, s4
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmov s8, r1
; CHECK-MVE-NEXT: vmovx.f16 s8, s0
; CHECK-MVE-NEXT: vadd.f16 s8, s10, s8
; CHECK-MVE-NEXT: vadd.f16 s12, s5, s1
; CHECK-MVE-NEXT: vmov r1, s8
; CHECK-MVE-NEXT: vmov.16 q2[0], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s1
; CHECK-MVE-NEXT: vmovx.f16 s14, s5
; CHECK-MVE-NEXT: vmov.16 q2[1], r1
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[2]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmov.16 q2[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[3]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vadd.f16 s12, s6, s2
; CHECK-MVE-NEXT: vmov.16 q2[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[4]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s2
; CHECK-MVE-NEXT: vmovx.f16 s14, s6
; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov.16 q2[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[5]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmovx.f16 s2, s7
; CHECK-MVE-NEXT: vadd.f16 s12, s7, s3
; CHECK-MVE-NEXT: vmov.16 q2[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[6]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vadd.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmov.16 q2[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[7]
; CHECK-MVE-NEXT: vmov s2, r0
; CHECK-MVE-NEXT: vadd.f16 s0, s2, s0
; CHECK-MVE-NEXT: vmov.16 q2[6], r0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q2[7], r0
; CHECK-MVE-NEXT: vmov q0, q2
@ -256,60 +232,36 @@ entry:
define arm_aapcs_vfpcc <8 x half> @sub_float16_t(<8 x half> %src1, <8 x half> %src2) {
; CHECK-MVE-LABEL: sub_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q0[1]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[0]
; CHECK-MVE-NEXT: vmov s10, r0
; CHECK-MVE-NEXT: vmov.u16 r2, q1[1]
; CHECK-MVE-NEXT: vsub.f16 s8, s10, s8
; CHECK-MVE-NEXT: vmov s10, r2
; CHECK-MVE-NEXT: vsub.f16 s8, s4, s0
; CHECK-MVE-NEXT: vmovx.f16 s10, s4
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmov s8, r1
; CHECK-MVE-NEXT: vmovx.f16 s8, s0
; CHECK-MVE-NEXT: vsub.f16 s8, s10, s8
; CHECK-MVE-NEXT: vsub.f16 s12, s5, s1
; CHECK-MVE-NEXT: vmov r1, s8
; CHECK-MVE-NEXT: vmov.16 q2[0], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s1
; CHECK-MVE-NEXT: vmovx.f16 s14, s5
; CHECK-MVE-NEXT: vmov.16 q2[1], r1
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[2]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmov.16 q2[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[3]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vsub.f16 s12, s6, s2
; CHECK-MVE-NEXT: vmov.16 q2[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[4]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s2
; CHECK-MVE-NEXT: vmovx.f16 s14, s6
; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov.16 q2[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[5]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmovx.f16 s2, s7
; CHECK-MVE-NEXT: vsub.f16 s12, s7, s3
; CHECK-MVE-NEXT: vmov.16 q2[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[6]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vsub.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmov.16 q2[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[7]
; CHECK-MVE-NEXT: vmov s2, r0
; CHECK-MVE-NEXT: vsub.f16 s0, s2, s0
; CHECK-MVE-NEXT: vmov.16 q2[6], r0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q2[7], r0
; CHECK-MVE-NEXT: vmov q0, q2
@ -414,60 +366,36 @@ entry:
define arm_aapcs_vfpcc <8 x half> @mul_float16_t(<8 x half> %src1, <8 x half> %src2) {
; CHECK-MVE-LABEL: mul_float16_t:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov.u16 r1, q0[1]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[0]
; CHECK-MVE-NEXT: vmov s10, r0
; CHECK-MVE-NEXT: vmov.u16 r2, q1[1]
; CHECK-MVE-NEXT: vmul.f16 s8, s10, s8
; CHECK-MVE-NEXT: vmov s10, r2
; CHECK-MVE-NEXT: vmul.f16 s8, s4, s0
; CHECK-MVE-NEXT: vmovx.f16 s10, s4
; CHECK-MVE-NEXT: vmov r0, s8
; CHECK-MVE-NEXT: vmov s8, r1
; CHECK-MVE-NEXT: vmovx.f16 s8, s0
; CHECK-MVE-NEXT: vmul.f16 s8, s10, s8
; CHECK-MVE-NEXT: vmul.f16 s12, s5, s1
; CHECK-MVE-NEXT: vmov r1, s8
; CHECK-MVE-NEXT: vmov.16 q2[0], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s1
; CHECK-MVE-NEXT: vmovx.f16 s14, s5
; CHECK-MVE-NEXT: vmov.16 q2[1], r1
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[2]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmov.16 q2[2], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[3]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmul.f16 s12, s6, s2
; CHECK-MVE-NEXT: vmov.16 q2[3], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[4]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmovx.f16 s12, s2
; CHECK-MVE-NEXT: vmovx.f16 s14, s6
; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov.16 q2[4], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[5]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmovx.f16 s0, s3
; CHECK-MVE-NEXT: vmovx.f16 s2, s7
; CHECK-MVE-NEXT: vmul.f16 s12, s7, s3
; CHECK-MVE-NEXT: vmov.16 q2[5], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[6]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vmul.f16 s12, s14, s12
; CHECK-MVE-NEXT: vmov r0, s12
; CHECK-MVE-NEXT: vmov.16 q2[6], r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q1[7]
; CHECK-MVE-NEXT: vmov s2, r0
; CHECK-MVE-NEXT: vmul.f16 s0, s2, s0
; CHECK-MVE-NEXT: vmov.16 q2[6], r0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q2[7], r0
; CHECK-MVE-NEXT: vmov q0, q2

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -213,29 +213,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @foo_int16_half(<8 x half> %src) {
; CHECK-MVE-LABEL: foo_int16_half:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s6, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s10, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[1]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
; CHECK-MVE-NEXT: vmov s5, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vmovx.f16 s4, s3
; CHECK-MVE-NEXT: vmovx.f16 s6, s2
; CHECK-MVE-NEXT: vmovx.f16 s10, s1
; CHECK-MVE-NEXT: vmovx.f16 s14, s0
; CHECK-MVE-NEXT: vcvt.s32.f16 s4, s4
; CHECK-MVE-NEXT: vcvt.s32.f16 s6, s6
; CHECK-MVE-NEXT: vcvt.s32.f16 s8, s8
; CHECK-MVE-NEXT: vcvt.s32.f16 s8, s3
; CHECK-MVE-NEXT: vcvt.s32.f16 s10, s10
; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s12
; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2
; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s14
; CHECK-MVE-NEXT: vcvt.s32.f16 s5, s5
; CHECK-MVE-NEXT: vcvt.s32.f16 s5, s1
; CHECK-MVE-NEXT: vcvt.s32.f16 s0, s0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q0[0], r0
@ -267,29 +255,17 @@ entry:
define arm_aapcs_vfpcc <8 x i16> @foo_uint16_half(<8 x half> %src) {
; CHECK-MVE-LABEL: foo_uint16_half:
; CHECK-MVE: @ %bb.0: @ %entry
; CHECK-MVE-NEXT: vmov.u16 r0, q0[7]
; CHECK-MVE-NEXT: vmov s4, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[5]
; CHECK-MVE-NEXT: vmov s6, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[6]
; CHECK-MVE-NEXT: vmov s8, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[3]
; CHECK-MVE-NEXT: vmov s10, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[4]
; CHECK-MVE-NEXT: vmov s12, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[1]
; CHECK-MVE-NEXT: vmov s14, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[2]
; CHECK-MVE-NEXT: vmov s5, r0
; CHECK-MVE-NEXT: vmov.u16 r0, q0[0]
; CHECK-MVE-NEXT: vmov s0, r0
; CHECK-MVE-NEXT: vmovx.f16 s4, s3
; CHECK-MVE-NEXT: vmovx.f16 s6, s2
; CHECK-MVE-NEXT: vmovx.f16 s10, s1
; CHECK-MVE-NEXT: vmovx.f16 s14, s0
; CHECK-MVE-NEXT: vcvt.s32.f16 s4, s4
; CHECK-MVE-NEXT: vcvt.s32.f16 s6, s6
; CHECK-MVE-NEXT: vcvt.s32.f16 s8, s8
; CHECK-MVE-NEXT: vcvt.s32.f16 s8, s3
; CHECK-MVE-NEXT: vcvt.s32.f16 s10, s10
; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s12
; CHECK-MVE-NEXT: vcvt.s32.f16 s12, s2
; CHECK-MVE-NEXT: vcvt.s32.f16 s14, s14
; CHECK-MVE-NEXT: vcvt.s32.f16 s5, s5
; CHECK-MVE-NEXT: vcvt.s32.f16 s5, s1
; CHECK-MVE-NEXT: vcvt.s32.f16 s0, s0
; CHECK-MVE-NEXT: vmov r0, s0
; CHECK-MVE-NEXT: vmov.16 q0[0], r0

View File

@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp %s -o - | FileCheck %s
define arm_aapcs_vfpcc <4 x float> @fma_v4f32(<4 x float> %dst, <4 x float> %s1, <4 x float> %s2) {