1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-26 04:32:44 +01:00
llvm-mirror/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
Tomas Matheson bb84fe4048 [CodeGen][regalloc] Don't align stack slots if the stack can't be realigned
Register allocation may spill virtual registers to the stack, which can
increase alignment requirements of the stack frame. If the function
did not require stack realignment before register allocation, the
registers required to do so may not be reserved/available. This results
in a stack frame that requires realignment but can not be realigned.

Instead, only increase the alignment of the stack if we are still able
to realign.

The register SpillAlignment will be ignored if we can't realign, and the
backend will be responsible for emitting the correct unaligned loads and
stores. This seems to be the assumed behaviour already, e.g.
ARMBaseInstrInfo::storeRegToStackSlot and X86InstrInfo::storeRegToStackSlot
are both `canRealignStack` aware.

Differential Revision: https://reviews.llvm.org/D103602
2021-06-11 16:49:12 +01:00

2936 lines
102 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; Predicated multiply-accumulate reduction on <4 x i32> with no extension:
; the lane products %x * %y are summed only where the matching lane of %b is
; zero. The expected assembly is a single VPT-predicated vmlavt.u32.
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavt.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <4 x i32> %b, zeroinitializer
%m = mul <4 x i32> %x, %y
; Masked-off lanes contribute zero to the sum.
%s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
ret i32 %z
}
; Predicated widening reduction: <4 x i32> operands are zero-extended to i64,
; multiplied, masked by (%b == 0) and summed into an i64. The expected
; assembly is a VPT-predicated vmlalvt.u32 writing the r0/r1 pair.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <4 x i32> %b, zeroinitializer
%xx = zext <4 x i32> %x to <4 x i64>
%yy = zext <4 x i32> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
; Signed counterpart of the <4 x i32> -> i64 predicated reduction: operands
; are sign-extended before the multiply. The expected assembly is a
; VPT-predicated vmlalvt.s32 writing the r0/r1 pair.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <4 x i32> %b, zeroinitializer
%xx = sext <4 x i32> %x to <4 x i64>
%yy = sext <4 x i32> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
; Two-lane case: <2 x i32> operands zero-extended to i64, multiplied, masked
; by (%b == 0) and summed. The expected assembly does not use a predicated
; reduction instruction here: vmullb.u32 forms the 64-bit products, the lane
; mask is built in scalar registers (cmp/cset/csetm) and applied with vand,
; and the two 64-bit lanes are added with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vmullb.u32 q3, q0, q1
; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: cset r1, eq
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: csetm r1, ne
; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
; CHECK-NEXT: vand q0, q3, q0
; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <2 x i32> %b, zeroinitializer
%xx = zext <2 x i32> %x to <2 x i64>
%yy = zext <2 x i32> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
ret i64 %z
}
; Signed two-lane case: <2 x i32> operands sign-extended to i64, multiplied,
; masked by (%b == 0) and summed. The expected assembly has the same shape as
; the unsigned variant, with vmullb.s32 producing the products; the mask is
; built with cmp/cset/csetm, applied with vand, and the lanes are added with
; adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vmullb.s32 q3, q0, q1
; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: cset r1, eq
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: csetm r1, ne
; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
; CHECK-NEXT: vand q0, q3, q0
; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <2 x i32> %b, zeroinitializer
%xx = sext <2 x i32> %x to <2 x i64>
%yy = sext <2 x i32> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
ret i64 %z
}
; <8 x i16> operands zero-extended to i32, multiplied, masked by (%b == 0)
; and summed into an i32. The expected assembly is a VPT-predicated
; vmlavt.u16, with no separate extension instructions.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavt.u16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = zext <8 x i16> %x to <8 x i32>
%yy = zext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
ret i32 %z
}
; <8 x i16> operands sign-extended to i32, multiplied, masked by (%b == 0)
; and summed into an i32. The expected assembly is a VPT-predicated
; vmlavt.s16, with no separate extension instructions.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavt.s16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = sext <8 x i16> %x to <8 x i32>
%yy = sext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
ret i32 %z
}
; Four-lane i16 inputs: <4 x i16> operands zero-extended to i32, multiplied,
; masked by (%b == 0) and summed. In the expected assembly all three inputs
; are first widened with vmovlb.u16, then a VPT-predicated vmlavt.u32
; performs the reduction.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u16 q1, q1
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmovlb.u16 q2, q2
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavt.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <4 x i16> %b, zeroinitializer
%xx = zext <4 x i16> %x to <4 x i32>
%yy = zext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
ret i32 %z
}
; Four-lane i16 inputs, signed: <4 x i16> operands sign-extended to i32,
; multiplied, masked by (%b == 0) and summed. In the expected assembly %x and
; %y are widened with vmovlb.s16 while the mask source %b is widened with
; vmovlb.u16; the reduction is a VPT-predicated vmlavt.u32.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmovlb.u16 q2, q2
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavt.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <4 x i16> %b, zeroinitializer
%xx = sext <4 x i16> %x to <4 x i32>
%yy = sext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
ret i32 %z
}
; Non-widening <8 x i16> case with a zeroext i16 result: lane products are
; masked by (%b == 0) and summed in i16. The expected assembly is a
; VPT-predicated vmlavt.u16 followed by uxth on the result register.
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavt.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <8 x i16> %b, zeroinitializer
%m = mul <8 x i16> %x, %y
; Masked-off lanes contribute zero to the sum.
%s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
ret i16 %z
}
; <8 x i16> operands zero-extended directly to i64, multiplied, masked by
; (%b == 0) and summed into an i64. The expected assembly is a VPT-predicated
; vmlalvt.u16 writing the r0/r1 pair.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = zext <8 x i16> %x to <8 x i64>
%yy = zext <8 x i16> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
; <8 x i16> operands sign-extended directly to i64, multiplied, masked by
; (%b == 0) and summed into an i64. The expected assembly is a VPT-predicated
; vmlalvt.s16 writing the r0/r1 pair.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = sext <8 x i16> %x to <8 x i64>
%yy = sext <8 x i16> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
; Two-step unsigned widening: <8 x i16> operands are zero-extended to i32 and
; multiplied, then the i32 product is zero-extended to i64 before the masked
; sum. The expected assembly is the same VPT-predicated vmlalvt.u16 as for
; the direct i16 -> i64 extension.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = zext <8 x i16> %x to <8 x i32>
%yy = zext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
; Second widening step is applied to the product, not to the operands.
%ma = zext <8 x i32> %m to <8 x i64>
%s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
; Two-step signed widening: <8 x i16> operands are sign-extended to i32 and
; multiplied, then the i32 product is sign-extended to i64 before the masked
; sum. The expected assembly is the same VPT-predicated vmlalvt.s16 as for
; the direct i16 -> i64 extension.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = sext <8 x i16> %x to <8 x i32>
%yy = sext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
; Second widening step is applied to the product, not to the operands.
%ma = sext <8 x i32> %m to <8 x i64>
%s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
; Mixed-extension case: %x is sign-extended to i32 and squared, and the
; square is then ZERO-extended to i64 before the masked sum. %y is not used
; by the body. The expected assembly is a VPT-predicated vmlalvt.s16 with q0
; as both multiplicands.
; NOTE(review): presumably the signed instruction is valid here because the
; square of a sign-extended i16 is non-negative in i32, making the zext
; equivalent to a sext - confirm against the combine that matches this.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_sextzext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q0
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = sext <8 x i16> %x to <8 x i32>
; Same value on both sides of the multiply.
%m = mul <8 x i32> %xx, %xx
%ma = zext <8 x i32> %m to <8 x i64>
%s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
; <4 x i16> operands zero-extended to i64, multiplied, masked by (%b == 0)
; and summed into an i64. In the expected assembly all three inputs are
; widened with vmovlb.u16, then a VPT-predicated vmlalvt.u32 writes the
; r0/r1 pair.
define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
; CHECK-LABEL: add_v4i16_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u16 q1, q1
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmovlb.u16 q2, q2
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <4 x i16> %b, zeroinitializer
%xx = zext <4 x i16> %x to <4 x i64>
%yy = zext <4 x i16> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
; <4 x i16> operands sign-extended to i64, multiplied, masked by (%b == 0)
; and summed into an i64. In the expected assembly %x and %y are widened with
; vmovlb.s16 and the mask source %b with vmovlb.u16, then a VPT-predicated
; vmlalvt.s32 writes the r0/r1 pair.
define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
; CHECK-LABEL: add_v4i16_v4i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmovlb.u16 q2, q2
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <4 x i16> %b, zeroinitializer
%xx = sext <4 x i16> %x to <4 x i64>
%yy = sext <4 x i16> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
; Two-lane i16 case, unsigned: <2 x i16> operands zero-extended to i64,
; multiplied, masked by (%b == 0) and summed. The expected assembly is a
; mostly scalar sequence: the inputs are masked with a 0xffff-per-64-bit-lane
; constant (vmov.i64 + vand), each product is formed with umull, the lane
; mask is built with cmp/cset/csetm and applied with vand, and the two lanes
; are added with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
; CHECK-LABEL: add_v2i16_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q3, #0xffff
; CHECK-NEXT: vand q1, q1, q3
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vand q1, q2, q3
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r0, r1, r1, r0
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
; CHECK-NEXT: vmov r1, s4
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: cset r1, eq
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: csetm r1, ne
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <2 x i16> %b, zeroinitializer
%xx = zext <2 x i16> %x to <2 x i64>
%yy = zext <2 x i16> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
ret i64 %z
}
; Two-lane i16 case, signed: <2 x i16> operands sign-extended to i64,
; multiplied, masked by (%b == 0) and summed. The expected assembly is a
; mostly scalar sequence: each operand lane is moved to a core register,
; sign-extended with sxth and multiplied with smull; the mask source %b is
; first ANDed with 0xffff, the lane mask is built with cmp/cset/csetm and
; applied with vand, and the two lanes are added with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q3, #0xffff
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: sxth r3, r3
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: cset r1, eq
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: csetm r1, ne
; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: sxth r1, r1
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <2 x i16> %b, zeroinitializer
%xx = sext <2 x i16> %x to <2 x i64>
%yy = sext <2 x i16> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
ret i64 %z
}
; <16 x i8> operands zero-extended to i32, multiplied, masked by (%b == 0)
; and summed into an i32. The expected assembly is a VPT-predicated
; vmlavt.u8, with no separate extension instructions.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.u8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = zext <16 x i8> %x to <16 x i32>
%yy = zext <16 x i8> %y to <16 x i32>
%m = mul <16 x i32> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
ret i32 %z
}
; <16 x i8> operands sign-extended to i32, multiplied, masked by (%b == 0)
; and summed into an i32. The expected assembly is a VPT-predicated
; vmlavt.s8, with no separate extension instructions.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.s8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = sext <16 x i8> %x to <16 x i32>
%yy = sext <16 x i8> %y to <16 x i32>
%m = mul <16 x i32> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
ret i32 %z
}
; Two-step unsigned widening: <16 x i8> operands are zero-extended to i16
; and multiplied, then the i16 product is zero-extended to i32 before the
; masked sum. The expected assembly is the same VPT-predicated vmlavt.u8 as
; for the direct i8 -> i32 extension.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.u8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = zext <16 x i8> %x to <16 x i16>
%yy = zext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
; Second widening step is applied to the product, not to the operands.
%ma = zext <16 x i16> %m to <16 x i32>
%s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
ret i32 %z
}
; Two-step signed widening: <16 x i8> operands are sign-extended to i16 and
; multiplied, then the i16 product is sign-extended to i32 before the masked
; sum. The expected assembly is the same VPT-predicated vmlavt.s8 as for the
; direct i8 -> i32 extension.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.s8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = sext <16 x i8> %x to <16 x i16>
%yy = sext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
; Second widening step is applied to the product, not to the operands.
%ma = sext <16 x i16> %m to <16 x i32>
%s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
ret i32 %z
}
; Mixed-extension case: %x is sign-extended to i16 and squared, and the
; square is then ZERO-extended to i32 before the masked sum. %y is not used
; by the body. The expected assembly is a VPT-predicated vmlavt.s8 with q0 as
; both multiplicands.
; NOTE(review): presumably the signed instruction is valid here because the
; square of a sign-extended i8 is non-negative in i16, making the zext
; equivalent to a sext - confirm against the combine that matches this.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_sextzext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.s8 r0, q0, q0
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = sext <16 x i8> %x to <16 x i16>
; Same value on both sides of the multiply.
%m = mul <16 x i16> %xx, %xx
%ma = zext <16 x i16> %m to <16 x i32>
%s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
ret i32 %z
}
; <8 x i8> operands zero-extended to i32, multiplied, masked by (%b == 0)
; and summed into an i32. The expected assembly does not collapse into one
; predicated reduction instruction. As pinned below it:
;   - saves and restores d8-d11 (vpush/vpop),
;   - widens the inputs with vmovlb.u8 and rebuilds 32-bit lane vectors from
;     individual 16-bit lanes, first lanes 0-3 and then lanes 4-7,
;   - materialises the i16 compare result as a 0xff/0x0 vector with vpsel
;     and re-compares four lanes at a time to obtain i32 predicates,
;   - multiplies lanes 0-3 with a predicated vmult.i32 into a zeroed
;     accumulator and lanes 4-7 with vmullb.u16,
;   - merges the second product with a predicated vaddt.i32 and reduces with
;     vaddv.u32.
define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vmovlb.u8 q1, q1
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmov.u16 r0, q1[2]
; CHECK-NEXT: vmov.u16 r1, q1[0]
; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q1[3]
; CHECK-NEXT: vmov.u16 r1, q1[1]
; CHECK-NEXT: vmovlb.u8 q2, q2
; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
; CHECK-NEXT: vmov.u16 r0, q0[2]
; CHECK-NEXT: vmov.u16 r1, q0[0]
; CHECK-NEXT: vmovlb.u16 q4, q3
; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q0[3]
; CHECK-NEXT: vmov.u16 r1, q0[1]
; CHECK-NEXT: vcmp.i16 eq, q2, zr
; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
; CHECK-NEXT: vmov.i8 q2, #0x0
; CHECK-NEXT: vmovlb.u16 q5, q3
; CHECK-NEXT: vmov.i8 q3, #0xff
; CHECK-NEXT: vpsel q2, q3, q2
; CHECK-NEXT: vmov.u16 r0, q2[2]
; CHECK-NEXT: vmov.u16 r1, q2[0]
; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q2[3]
; CHECK-NEXT: vmov.u16 r1, q2[1]
; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
; CHECK-NEXT: vmov.u16 r0, q1[6]
; CHECK-NEXT: vcmp.i32 ne, q3, zr
; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: vmov.u16 r1, q1[4]
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i32 q3, q5, q4
; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q1[7]
; CHECK-NEXT: vmov.u16 r1, q1[5]
; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q0[7]
; CHECK-NEXT: vmov.u16 r1, q0[5]
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vmov.u16 r0, q2[6]
; CHECK-NEXT: vmov.u16 r1, q2[4]
; CHECK-NEXT: vmullb.u16 q0, q1, q4
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q2[7]
; CHECK-NEXT: vmov.u16 r1, q2[5]
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vaddt.i32 q3, q3, q0
; CHECK-NEXT: vaddv.u32 r0, q3
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <8 x i8> %b, zeroinitializer
%xx = zext <8 x i8> %x to <8 x i32>
%yy = zext <8 x i8> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
ret i32 %z
}
; <8 x i8> operands sign-extended to i32, multiplied, masked by (%b == 0)
; and summed into an i32. The expected assembly does not collapse into one
; predicated reduction instruction. As pinned below it:
;   - saves and restores d8-d11 (vpush/vpop),
;   - rebuilds four-lane vectors from individual 16-bit lanes of %x and %y
;     (lanes 0-3, then lanes 4-7) and sign-extends each with vmovlb.s8
;     followed by vmovlb.s16,
;   - widens the mask source %b with vmovlb.u8, materialises the i16 compare
;     result as a 0xff/0x0 vector with vpsel, and re-compares four lanes at a
;     time to obtain i32 predicates,
;   - multiplies lanes 0-3 with a predicated vmult.i32 into a zeroed
;     accumulator and lanes 4-7 with an unpredicated vmul.i32,
;   - merges the second product with a predicated vaddt.i32 and reduces with
;     vaddv.u32.
define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vmov.u16 r0, q1[2]
; CHECK-NEXT: vmov.u16 r1, q1[0]
; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q1[3]
; CHECK-NEXT: vmov.u16 r1, q1[1]
; CHECK-NEXT: vmovlb.u8 q2, q2
; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
; CHECK-NEXT: vmov.u16 r0, q0[2]
; CHECK-NEXT: vmovlb.s8 q3, q3
; CHECK-NEXT: vmov.u16 r1, q0[0]
; CHECK-NEXT: vmovlb.s16 q4, q3
; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q0[3]
; CHECK-NEXT: vmov.u16 r1, q0[1]
; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
; CHECK-NEXT: vcmp.i16 eq, q2, zr
; CHECK-NEXT: vmovlb.s8 q3, q3
; CHECK-NEXT: vmov.i8 q2, #0x0
; CHECK-NEXT: vmovlb.s16 q5, q3
; CHECK-NEXT: vmov.i8 q3, #0xff
; CHECK-NEXT: vpsel q2, q3, q2
; CHECK-NEXT: vmov.u16 r0, q2[2]
; CHECK-NEXT: vmov.u16 r1, q2[0]
; CHECK-NEXT: vmov q3[2], q3[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q2[3]
; CHECK-NEXT: vmov.u16 r1, q2[1]
; CHECK-NEXT: vmov q3[3], q3[1], r1, r0
; CHECK-NEXT: vmov.u16 r0, q1[6]
; CHECK-NEXT: vcmp.i32 ne, q3, zr
; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: vmov.u16 r1, q1[4]
; CHECK-NEXT: vpst
; CHECK-NEXT: vmult.i32 q3, q5, q4
; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q1[7]
; CHECK-NEXT: vmov.u16 r1, q1[5]
; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
; CHECK-NEXT: vmov.u16 r0, q0[6]
; CHECK-NEXT: vmov.u16 r1, q0[4]
; CHECK-NEXT: vmovlb.s8 q1, q4
; CHECK-NEXT: vmov q4[2], q4[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q0[7]
; CHECK-NEXT: vmov.u16 r1, q0[5]
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmov q4[3], q4[1], r1, r0
; CHECK-NEXT: vmov.u16 r0, q2[6]
; CHECK-NEXT: vmovlb.s8 q0, q4
; CHECK-NEXT: vmov.u16 r1, q2[4]
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmul.i32 q0, q0, q1
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q2[7]
; CHECK-NEXT: vmov.u16 r1, q2[5]
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vpt.i32 ne, q1, zr
; CHECK-NEXT: vaddt.i32 q3, q3, q0
; CHECK-NEXT: vaddv.u32 r0, q3
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <8 x i8> %b, zeroinitializer
%xx = sext <8 x i8> %x to <8 x i32>
%yy = sext <8 x i8> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
ret i32 %z
}
; <4 x i8> operands zero-extended to i32, multiplied, masked by (%b == 0)
; and summed into an i32. In the expected assembly the zero-extension of all
; three inputs is done by ANDing with a 0xff-per-lane constant, followed by a
; VPT-predicated vmlavt.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
; CHECK-LABEL: add_v4i8_v4i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q3, #0xff
; CHECK-NEXT: vand q1, q1, q3
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavt.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <4 x i8> %b, zeroinitializer
%xx = zext <4 x i8> %x to <4 x i32>
%yy = zext <4 x i8> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
ret i32 %z
}
; <4 x i8> operands sign-extended to i32, multiplied, masked by (%b == 0)
; and summed into an i32. In the expected assembly %x and %y are
; sign-extended in two steps (vmovlb.s8 then vmovlb.s16), the mask source %b
; is ANDed with a 0xff-per-lane constant, and the reduction is a
; VPT-predicated vmlavt.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
; CHECK-LABEL: add_v4i8_v4i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmov.i32 q3, #0xff
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavt.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <4 x i8> %b, zeroinitializer
%xx = sext <4 x i8> %x to <4 x i32>
%yy = sext <4 x i8> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
ret i32 %z
}
; <16 x i8> operands zero-extended to i16, multiplied, masked by (%b == 0)
; and summed into a zeroext i16 result. The expected assembly is a
; VPT-predicated vmlavt.u8 followed by uxth on the result register.
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.u8 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = zext <16 x i8> %x to <16 x i16>
%yy = zext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
ret i16 %z
}
; <16 x i8> operands sign-extended to i16, multiplied, masked by (%b == 0)
; and summed into a signext i16 result. The expected assembly is a
; VPT-predicated vmlavt.s8 followed by sxth on the result register.
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i16_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.s8 r0, q0, q1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = sext <16 x i8> %x to <16 x i16>
%yy = sext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
ret i16 %z
}
; <8 x i8> operands zero-extended to i16, multiplied, masked by (%b == 0)
; and summed into a zeroext i16 result. In the expected assembly all three
; inputs are widened with vmovlb.u8, the reduction is a VPT-predicated
; vmlavt.u16, and the result is finished with uxth.
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i16_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q1, q1
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmovlb.u8 q2, q2
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavt.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <8 x i8> %b, zeroinitializer
%xx = zext <8 x i8> %x to <8 x i16>
%yy = zext <8 x i8> %y to <8 x i16>
%m = mul <8 x i16> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
ret i16 %z
}
; <8 x i8> operands sign-extended to i16, multiplied, masked by (%b == 0)
; and summed into a signext i16 result. In the expected assembly %x and %y
; are widened with vmovlb.s8 and the mask source %b with vmovlb.u8; the
; reduction is a VPT-predicated vmlavt.u16, finished with sxth.
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i16_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.u8 q2, q2
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavt.u16 r0, q0, q1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <8 x i8> %b, zeroinitializer
%xx = sext <8 x i8> %x to <8 x i16>
%yy = sext <8 x i8> %y to <8 x i16>
%m = mul <8 x i16> %xx, %yy
; Masked-off lanes contribute zero to the sum.
%s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
ret i16 %z
}
; Non-widening <16 x i8> case with a zeroext i8 result: lane products are
; masked by (%b == 0) and summed in i8. The expected assembly is a
; VPT-predicated vmlavt.u8 followed by uxtb on the result register.
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavt.u8 r0, q0, q1
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
; Lane mask: true where %b == 0.
%c = icmp eq <16 x i8> %b, zeroinitializer
%m = mul <16 x i8> %x, %y
; Masked-off lanes contribute zero to the sum.
%s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
ret i8 %z
}
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vcmp.i8 eq, q2, zr
; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vpsel q5, q2, q0
; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vmov.u8 r0, q5[0]
; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill
; CHECK-NEXT: vmov.16 q2[0], r0
; CHECK-NEXT: vmov.u8 r0, q5[1]
; CHECK-NEXT: vmov.16 q2[1], r0
; CHECK-NEXT: vmov.u8 r0, q5[2]
; CHECK-NEXT: vmov.16 q2[2], r0
; CHECK-NEXT: vmov.u8 r0, q5[3]
; CHECK-NEXT: vmov.16 q2[3], r0
; CHECK-NEXT: vmov.u8 r0, q5[4]
; CHECK-NEXT: vmov.16 q2[4], r0
; CHECK-NEXT: vmov.u8 r0, q5[5]
; CHECK-NEXT: vmov.16 q2[5], r0
; CHECK-NEXT: vmov.u8 r0, q5[6]
; CHECK-NEXT: vmov.16 q2[6], r0
; CHECK-NEXT: vmov.u8 r0, q5[7]
; CHECK-NEXT: vmov.16 q2[7], r0
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vcmp.i16 ne, q2, zr
; CHECK-NEXT: vmov.u8 r3, q4[0]
; CHECK-NEXT: vpsel q6, q3, q0
; CHECK-NEXT: vmov.u16 r0, q6[2]
; CHECK-NEXT: vmov.u16 r1, q6[0]
; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q6[3]
; CHECK-NEXT: vmov.u16 r1, q6[1]
; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
; CHECK-NEXT: vcmp.i32 ne, q2, zr
; CHECK-NEXT: vmov.i64 q2, #0xff
; CHECK-NEXT: vmrs r0, p0
; CHECK-NEXT: and r2, r0, #1
; CHECK-NEXT: ubfx r1, r0, #4, #1
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: vmov q7[2], q7[0], r2, r1
; CHECK-NEXT: vmov q7[3], q7[1], r2, r1
; CHECK-NEXT: vmov.u8 r1, q1[1]
; CHECK-NEXT: vmov.u8 r2, q1[0]
; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
; CHECK-NEXT: vmov.u8 r2, q4[1]
; CHECK-NEXT: vmov q3[2], q3[0], r3, r2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r1, r12, r2, r1
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: umull r2, r3, r2, r3
; CHECK-NEXT: vmov q0[2], q0[0], r2, r1
; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
; CHECK-NEXT: vand q0, q0, q7
; CHECK-NEXT: vmov r1, r12, d1
; CHECK-NEXT: vmov r3, r2, d0
; CHECK-NEXT: adds.w lr, r3, r1
; CHECK-NEXT: ubfx r3, r0, #12, #1
; CHECK-NEXT: ubfx r0, r0, #8, #1
; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: rsb.w r0, r0, #0
; CHECK-NEXT: vmov.u8 r1, q4[2]
; CHECK-NEXT: vmov q7[2], q7[0], r0, r3
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov q7[3], q7[1], r0, r3
; CHECK-NEXT: vmov.u8 r0, q1[3]
; CHECK-NEXT: vmov.u8 r3, q1[2]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r0
; CHECK-NEXT: vmov.u8 r3, q4[3]
; CHECK-NEXT: vmov q3[2], q3[0], r1, r3
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r1, s14
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: umull r0, r1, r1, r0
; CHECK-NEXT: umull r2, r3, r2, r3
; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
; CHECK-NEXT: vand q0, q0, q7
; CHECK-NEXT: vmov q7, q4
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: adds.w r0, r0, lr
; CHECK-NEXT: adc.w r1, r1, r12
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: adc.w lr, r1, r3
; CHECK-NEXT: vmov.u16 r2, q6[6]
; CHECK-NEXT: vmov.u16 r3, q6[4]
; CHECK-NEXT: vmov.u8 r1, q4[4]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov.u16 r2, q6[7]
; CHECK-NEXT: vmov.u16 r3, q6[5]
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: and r0, r2, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q6[2], q6[0], r0, r3
; CHECK-NEXT: vmov q6[3], q6[1], r0, r3
; CHECK-NEXT: vmov.u8 r0, q1[5]
; CHECK-NEXT: vmov.u8 r3, q1[4]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r0
; CHECK-NEXT: vmov.u8 r3, q4[5]
; CHECK-NEXT: vmov q3[2], q3[0], r1, r3
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r1, s14
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r4, s12
; CHECK-NEXT: umull r0, r1, r1, r0
; CHECK-NEXT: umull r3, r4, r4, r3
; CHECK-NEXT: vmov q0[2], q0[0], r3, r0
; CHECK-NEXT: vmov q0[3], q0[1], r4, r1
; CHECK-NEXT: vand q0, q0, q6
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r3, r4, d1
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adc.w r1, r1, lr
; CHECK-NEXT: adds.w r12, r0, r3
; CHECK-NEXT: ubfx r3, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: rsb.w r2, r2, #0
; CHECK-NEXT: vmov q6[2], q6[0], r2, r3
; CHECK-NEXT: adcs r1, r4
; CHECK-NEXT: vmov q6[3], q6[1], r2, r3
; CHECK-NEXT: vmov.u8 r2, q1[7]
; CHECK-NEXT: vmov.u8 r3, q1[6]
; CHECK-NEXT: vmov.u8 r4, q4[6]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov.u8 r3, q4[7]
; CHECK-NEXT: vmov q3[2], q3[0], r4, r3
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s14
; CHECK-NEXT: vmov r0, s12
; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: umull r0, r4, r0, r4
; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
; CHECK-NEXT: vand q0, q0, q6
; CHECK-NEXT: vmov r0, r2, d0
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u8 r2, q5[8]
; CHECK-NEXT: vmov.16 q6[0], r2
; CHECK-NEXT: vmov.u8 r2, q5[9]
; CHECK-NEXT: vmov.16 q6[1], r2
; CHECK-NEXT: vmov.u8 r2, q5[10]
; CHECK-NEXT: vmov.16 q6[2], r2
; CHECK-NEXT: vmov.u8 r2, q5[11]
; CHECK-NEXT: vmov.16 q6[3], r2
; CHECK-NEXT: vmov.u8 r2, q5[12]
; CHECK-NEXT: vmov.16 q6[4], r2
; CHECK-NEXT: vmov.u8 r2, q5[13]
; CHECK-NEXT: vmov.16 q6[5], r2
; CHECK-NEXT: vmov.u8 r2, q5[14]
; CHECK-NEXT: vmov.16 q6[6], r2
; CHECK-NEXT: vmov.u8 r2, q5[15]
; CHECK-NEXT: vmov.16 q6[7], r2
; CHECK-NEXT: adc.w lr, r1, r3
; CHECK-NEXT: vcmp.i16 ne, q6, zr
; CHECK-NEXT: vmov.u8 r0, q7[8]
; CHECK-NEXT: vpsel q3, q3, q0
; CHECK-NEXT: vmov.u16 r2, q3[2]
; CHECK-NEXT: vmov.u16 r3, q3[0]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov.u16 r2, q3[3]
; CHECK-NEXT: vmov.u16 r3, q3[1]
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: and r4, r2, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r4, r4, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q4[2], q4[0], r4, r3
; CHECK-NEXT: vmov q4[3], q4[1], r4, r3
; CHECK-NEXT: vmov.u8 r3, q1[9]
; CHECK-NEXT: vmov.u8 r4, q1[8]
; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
; CHECK-NEXT: vmov.u8 r4, q7[9]
; CHECK-NEXT: vmov q5[2], q5[0], r0, r4
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q5, q5, q2
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov r0, s22
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vmov r1, s20
; CHECK-NEXT: umull r0, r3, r0, r3
; CHECK-NEXT: umull r1, r4, r1, r4
; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
; CHECK-NEXT: vand q0, q0, q4
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r3, r4, d1
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adc.w r1, r1, lr
; CHECK-NEXT: adds.w r12, r0, r3
; CHECK-NEXT: ubfx r3, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: rsb.w r2, r2, #0
; CHECK-NEXT: vmov q4[2], q4[0], r2, r3
; CHECK-NEXT: adcs r1, r4
; CHECK-NEXT: vmov q4[3], q4[1], r2, r3
; CHECK-NEXT: vmov.u8 r2, q1[11]
; CHECK-NEXT: vmov.u8 r3, q1[10]
; CHECK-NEXT: vmov.u8 r4, q7[10]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov.u8 r3, q7[11]
; CHECK-NEXT: vmov q5[2], q5[0], r4, r3
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q5, q5, q2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s22
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vmov r0, s20
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: umull r0, r4, r0, r4
; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
; CHECK-NEXT: vand q0, q0, q4
; CHECK-NEXT: vmov r0, r2, d0
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u16 r2, q3[6]
; CHECK-NEXT: adc.w lr, r1, r3
; CHECK-NEXT: vmov.u16 r3, q3[4]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov.u16 r2, q3[7]
; CHECK-NEXT: vmov.u16 r3, q3[5]
; CHECK-NEXT: vmov.u8 r0, q7[12]
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: and r4, r2, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r4, r4, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q3[2], q3[0], r4, r3
; CHECK-NEXT: vmov q3[3], q3[1], r4, r3
; CHECK-NEXT: vmov.u8 r3, q1[13]
; CHECK-NEXT: vmov.u8 r4, q1[12]
; CHECK-NEXT: vmov q0[2], q0[0], r4, r3
; CHECK-NEXT: vmov.u8 r4, q7[13]
; CHECK-NEXT: vmov q4[2], q4[0], r0, r4
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vmov r1, s16
; CHECK-NEXT: umull r0, r3, r0, r3
; CHECK-NEXT: umull r1, r4, r1, r4
; CHECK-NEXT: vmov q0[2], q0[0], r1, r0
; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vmov r0, r1, d0
; CHECK-NEXT: vmov r3, r4, d1
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adc.w r1, r1, lr
; CHECK-NEXT: adds.w r12, r0, r3
; CHECK-NEXT: ubfx r3, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: rsb.w r2, r2, #0
; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
; CHECK-NEXT: adcs r1, r4
; CHECK-NEXT: vmov q3[3], q3[1], r2, r3
; CHECK-NEXT: vmov.u8 r2, q1[15]
; CHECK-NEXT: vmov.u8 r3, q1[14]
; CHECK-NEXT: vmov.u8 r4, q7[14]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov.u8 r3, q7[15]
; CHECK-NEXT: vmov q1[2], q1[0], r4, r3
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: umull r0, r4, r0, r4
; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vmov r0, r2, d0
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, pc}
entry:
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = zext <16 x i8> %x to <16 x i64>
%yy = zext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
ret i64 %z
}
; Predicated i64 multiply-add reduction over sign-extended <16 x i8> inputs:
; lanes where %b == 0 contribute sext(x)*sext(y), every other lane adds 0.
; The expected output below is expanded two lanes at a time (scalar smull,
; lane masks rebuilt from vmrs p0, 64-bit adds/adc chains) rather than being
; a single predicated reduction instruction.
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
; CHECK-LABEL: add_v16i8_v16i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vcmp.i8 eq, q2, zr
; CHECK-NEXT: vmov.i8 q2, #0x0
; CHECK-NEXT: vmov.i8 q3, #0xff
; CHECK-NEXT: vmov.s8 r3, q1[0]
; CHECK-NEXT: vpsel q4, q3, q2
; CHECK-NEXT: vmov.s8 r4, q0[4]
; CHECK-NEXT: vmov.u8 r0, q4[0]
; CHECK-NEXT: vmov.16 q5[0], r0
; CHECK-NEXT: vmov.u8 r0, q4[1]
; CHECK-NEXT: vmov.16 q5[1], r0
; CHECK-NEXT: vmov.u8 r0, q4[2]
; CHECK-NEXT: vmov.16 q5[2], r0
; CHECK-NEXT: vmov.u8 r0, q4[3]
; CHECK-NEXT: vmov.16 q5[3], r0
; CHECK-NEXT: vmov.u8 r0, q4[4]
; CHECK-NEXT: vmov.16 q5[4], r0
; CHECK-NEXT: vmov.u8 r0, q4[5]
; CHECK-NEXT: vmov.16 q5[5], r0
; CHECK-NEXT: vmov.u8 r0, q4[6]
; CHECK-NEXT: vmov.16 q5[6], r0
; CHECK-NEXT: vmov.u8 r0, q4[7]
; CHECK-NEXT: vmov.16 q5[7], r0
; CHECK-NEXT: vcmp.i16 ne, q5, zr
; CHECK-NEXT: vpsel q5, q3, q2
; CHECK-NEXT: vmov.u16 r0, q5[2]
; CHECK-NEXT: vmov.u16 r1, q5[0]
; CHECK-NEXT: vmov q6[2], q6[0], r1, r0
; CHECK-NEXT: vmov.u16 r0, q5[3]
; CHECK-NEXT: vmov.u16 r1, q5[1]
; CHECK-NEXT: vmov q6[3], q6[1], r1, r0
; CHECK-NEXT: vcmp.i32 ne, q6, zr
; CHECK-NEXT: vmrs r0, p0
; CHECK-NEXT: and r2, r0, #1
; CHECK-NEXT: ubfx r1, r0, #4, #1
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: rsbs r1, r1, #0
; CHECK-NEXT: vmov q6[2], q6[0], r2, r1
; CHECK-NEXT: vmov q6[3], q6[1], r2, r1
; CHECK-NEXT: vmov.s8 r1, q1[1]
; CHECK-NEXT: vmov.s8 r2, q0[1]
; CHECK-NEXT: smull r1, r12, r2, r1
; CHECK-NEXT: vmov.s8 r2, q0[0]
; CHECK-NEXT: smull r2, r3, r2, r3
; CHECK-NEXT: vmov q7[2], q7[0], r2, r1
; CHECK-NEXT: vmov q7[3], q7[1], r3, r12
; CHECK-NEXT: vand q6, q7, q6
; CHECK-NEXT: vmov r1, r12, d13
; CHECK-NEXT: vmov r3, r2, d12
; CHECK-NEXT: adds.w lr, r3, r1
; CHECK-NEXT: ubfx r3, r0, #12, #1
; CHECK-NEXT: ubfx r0, r0, #8, #1
; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: rsb.w r0, r0, #0
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov q6[2], q6[0], r0, r3
; CHECK-NEXT: vmov.s8 r1, q1[2]
; CHECK-NEXT: vmov q6[3], q6[1], r0, r3
; CHECK-NEXT: vmov.s8 r2, q0[2]
; CHECK-NEXT: vmov.s8 r0, q1[3]
; CHECK-NEXT: vmov.s8 r3, q0[3]
; CHECK-NEXT: smull r0, r3, r3, r0
; CHECK-NEXT: smull r1, r2, r2, r1
; CHECK-NEXT: vmov q7[2], q7[0], r1, r0
; CHECK-NEXT: vmov q7[3], q7[1], r2, r3
; CHECK-NEXT: vand q6, q7, q6
; CHECK-NEXT: vmov r0, r1, d12
; CHECK-NEXT: vmov r2, r3, d13
; CHECK-NEXT: adds.w r0, r0, lr
; CHECK-NEXT: adc.w r1, r1, r12
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: adc.w lr, r1, r3
; CHECK-NEXT: vmov.u16 r2, q5[6]
; CHECK-NEXT: vmov.u16 r3, q5[4]
; CHECK-NEXT: vmov.s8 r1, q1[4]
; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
; CHECK-NEXT: vmov.u16 r2, q5[7]
; CHECK-NEXT: vmov.u16 r3, q5[5]
; CHECK-NEXT: smull r1, r4, r4, r1
; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q6, zr
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: and r0, r2, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r0, r0, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q5[2], q5[0], r0, r3
; CHECK-NEXT: vmov q5[3], q5[1], r0, r3
; CHECK-NEXT: vmov.s8 r0, q1[5]
; CHECK-NEXT: vmov.s8 r3, q0[5]
; CHECK-NEXT: smull r0, r3, r3, r0
; CHECK-NEXT: vmov q6[2], q6[0], r1, r0
; CHECK-NEXT: vmov q6[3], q6[1], r4, r3
; CHECK-NEXT: vand q5, q6, q5
; CHECK-NEXT: vmov r0, r1, d10
; CHECK-NEXT: vmov r3, r4, d11
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adc.w r1, r1, lr
; CHECK-NEXT: adds.w r12, r0, r3
; CHECK-NEXT: ubfx r3, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: rsb.w r2, r2, #0
; CHECK-NEXT: vmov q5[2], q5[0], r2, r3
; CHECK-NEXT: adcs r1, r4
; CHECK-NEXT: vmov q5[3], q5[1], r2, r3
; CHECK-NEXT: vmov.s8 r2, q1[7]
; CHECK-NEXT: vmov.s8 r3, q0[7]
; CHECK-NEXT: vmov.s8 r4, q1[6]
; CHECK-NEXT: vmov.s8 r0, q0[6]
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: smull r0, r4, r0, r4
; CHECK-NEXT: vmov q6[2], q6[0], r0, r2
; CHECK-NEXT: vmov q6[3], q6[1], r4, r3
; CHECK-NEXT: vand q5, q6, q5
; CHECK-NEXT: vmov r0, r2, d10
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, r3, d11
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u8 r2, q4[8]
; CHECK-NEXT: vmov.16 q5[0], r2
; CHECK-NEXT: vmov.u8 r2, q4[9]
; CHECK-NEXT: vmov.16 q5[1], r2
; CHECK-NEXT: vmov.u8 r2, q4[10]
; CHECK-NEXT: vmov.16 q5[2], r2
; CHECK-NEXT: vmov.u8 r2, q4[11]
; CHECK-NEXT: vmov.16 q5[3], r2
; CHECK-NEXT: vmov.u8 r2, q4[12]
; CHECK-NEXT: vmov.16 q5[4], r2
; CHECK-NEXT: vmov.u8 r2, q4[13]
; CHECK-NEXT: vmov.16 q5[5], r2
; CHECK-NEXT: vmov.u8 r2, q4[14]
; CHECK-NEXT: vmov.16 q5[6], r2
; CHECK-NEXT: vmov.u8 r2, q4[15]
; CHECK-NEXT: vmov.16 q5[7], r2
; CHECK-NEXT: adc.w lr, r1, r3
; CHECK-NEXT: vcmp.i16 ne, q5, zr
; CHECK-NEXT: vmov.s8 r0, q1[8]
; CHECK-NEXT: vpsel q2, q3, q2
; CHECK-NEXT: vmov.s8 r1, q0[8]
; CHECK-NEXT: vmov.u16 r2, q2[2]
; CHECK-NEXT: vmov.u16 r3, q2[0]
; CHECK-NEXT: vmov q3[2], q3[0], r3, r2
; CHECK-NEXT: vmov.u16 r2, q2[3]
; CHECK-NEXT: vmov.u16 r3, q2[1]
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vmov q3[3], q3[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q3, zr
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: and r4, r2, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r4, r4, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q3[2], q3[0], r4, r3
; CHECK-NEXT: vmov q3[3], q3[1], r4, r3
; CHECK-NEXT: vmov.s8 r3, q1[9]
; CHECK-NEXT: vmov.s8 r4, q0[9]
; CHECK-NEXT: smull r3, r4, r4, r3
; CHECK-NEXT: vmov q4[2], q4[0], r0, r3
; CHECK-NEXT: vmov q4[3], q4[1], r1, r4
; CHECK-NEXT: vand q3, q4, q3
; CHECK-NEXT: vmov r0, r1, d6
; CHECK-NEXT: vmov r3, r4, d7
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adc.w r1, r1, lr
; CHECK-NEXT: adds.w r12, r0, r3
; CHECK-NEXT: ubfx r3, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: rsb.w r2, r2, #0
; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
; CHECK-NEXT: adcs r1, r4
; CHECK-NEXT: vmov q3[3], q3[1], r2, r3
; CHECK-NEXT: vmov.s8 r2, q1[11]
; CHECK-NEXT: vmov.s8 r3, q0[11]
; CHECK-NEXT: vmov.s8 r4, q1[10]
; CHECK-NEXT: vmov.s8 r0, q0[10]
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: smull r0, r4, r0, r4
; CHECK-NEXT: vmov q4[2], q4[0], r0, r2
; CHECK-NEXT: vmov q4[3], q4[1], r4, r3
; CHECK-NEXT: vand q3, q4, q3
; CHECK-NEXT: vmov r0, r2, d6
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, r3, d7
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u16 r2, q2[6]
; CHECK-NEXT: adc.w lr, r1, r3
; CHECK-NEXT: vmov.u16 r3, q2[4]
; CHECK-NEXT: vmov q3[2], q3[0], r3, r2
; CHECK-NEXT: vmov.u16 r2, q2[7]
; CHECK-NEXT: vmov.u16 r3, q2[5]
; CHECK-NEXT: vmov.s8 r0, q1[12]
; CHECK-NEXT: vmov q3[3], q3[1], r3, r2
; CHECK-NEXT: vmov.s8 r1, q0[12]
; CHECK-NEXT: vcmp.i32 ne, q3, zr
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: and r4, r2, #1
; CHECK-NEXT: ubfx r3, r2, #4, #1
; CHECK-NEXT: rsbs r4, r4, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q2[2], q2[0], r4, r3
; CHECK-NEXT: vmov q2[3], q2[1], r4, r3
; CHECK-NEXT: vmov.s8 r3, q1[13]
; CHECK-NEXT: vmov.s8 r4, q0[13]
; CHECK-NEXT: smull r3, r4, r4, r3
; CHECK-NEXT: vmov q3[2], q3[0], r0, r3
; CHECK-NEXT: vmov q3[3], q3[1], r1, r4
; CHECK-NEXT: vand q2, q3, q2
; CHECK-NEXT: vmov r0, r1, d4
; CHECK-NEXT: vmov r3, r4, d5
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adc.w r1, r1, lr
; CHECK-NEXT: adds.w r12, r0, r3
; CHECK-NEXT: ubfx r3, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
; CHECK-NEXT: rsb.w r3, r3, #0
; CHECK-NEXT: rsb.w r2, r2, #0
; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
; CHECK-NEXT: adcs r1, r4
; CHECK-NEXT: vmov q2[3], q2[1], r2, r3
; CHECK-NEXT: vmov.s8 r2, q1[15]
; CHECK-NEXT: vmov.s8 r3, q0[15]
; CHECK-NEXT: vmov.s8 r4, q1[14]
; CHECK-NEXT: vmov.s8 r0, q0[14]
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: smull r0, r4, r0, r4
; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r0, r2, d0
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, pc}
entry:
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = sext <16 x i8> %x to <16 x i64>
%yy = sext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
ret i64 %z
}
; Predicated i64 multiply-add reduction over zero-extended <8 x i8> inputs
; (lanes with %b == 0 contribute zext(x)*zext(y), others 0).
; Expected output: all three operands widened with vmovlb.u8, then one
; VPT-predicated vmlalvt.u16.
define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q1, q1
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmovlb.u8 q2, q2
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvt.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <8 x i8> %b, zeroinitializer
%xx = zext <8 x i8> %x to <8 x i64>
%yy = zext <8 x i8> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
; Signed counterpart of add_v8i8_v8i64_zext: lanes with %b == 0 contribute
; sext(x)*sext(y), others 0.
; Expected output: %x/%y widened with vmovlb.s8, the compare operand %b
; widened with vmovlb.u8, then one VPT-predicated vmlalvt.s16.
define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
; CHECK-LABEL: add_v8i8_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.u8 q2, q2
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvt.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <8 x i8> %b, zeroinitializer
%xx = sext <8 x i8> %x to <8 x i64>
%yy = sext <8 x i8> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
ret i64 %z
}
; Predicated i64 multiply-add reduction over zero-extended <4 x i8> inputs
; (lanes with %b == 0 contribute zext(x)*zext(y), others 0).
; Expected output: each operand masked with #0xff per 32-bit lane (vand),
; then one VPT-predicated vmlalvt.u32.
define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
; CHECK-LABEL: add_v4i8_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q3, #0xff
; CHECK-NEXT: vand q1, q1, q3
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlalvt.u32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <4 x i8> %b, zeroinitializer
%xx = zext <4 x i8> %x to <4 x i64>
%yy = zext <4 x i8> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
; Signed counterpart of add_v4i8_v4i64_zext: lanes with %b == 0 contribute
; sext(x)*sext(y), others 0.
; Expected output: %x/%y sign-extended in two steps (vmovlb.s8 then
; vmovlb.s16), %b masked with #0xff, then one VPT-predicated vmlalvt.s32.
define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
; CHECK-LABEL: add_v4i8_v4i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmov.i32 q3, #0xff
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlalvt.s32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <4 x i8> %b, zeroinitializer
%xx = sext <4 x i8> %x to <4 x i64>
%yy = sext <4 x i8> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
ret i64 %z
}
; Predicated i64 multiply-add reduction over zero-extended <2 x i8> inputs
; (lanes with %b == 0 contribute zext(x)*zext(y), others 0).
; Expected output is scalarised: operands masked with #0xff, one umull per
; lane, an all-ones/zero lane mask built with cset/csetm and applied with
; vand, and the two 64-bit lanes summed with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
; CHECK-LABEL: add_v2i8_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q3, #0xff
; CHECK-NEXT: vand q1, q1, q3
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vand q1, q2, q3
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r0, r1, r1, r0
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
; CHECK-NEXT: vmov r1, s4
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: cset r1, eq
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: csetm r1, ne
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <2 x i8> %b, zeroinitializer
%xx = zext <2 x i8> %x to <2 x i64>
%yy = zext <2 x i8> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
ret i64 %z
}
; Signed counterpart of add_v2i8_v2i64_zext: lanes with %b == 0 contribute
; sext(x)*sext(y), others 0.
; Expected output is scalarised: each input lane is sign-extended with sxtb
; and multiplied with smull, the lane mask is built with cset/csetm from the
; #0xff-masked %b, applied with vand, and the lanes summed with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q3, #0xff
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vmov r1, s8
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: cset r1, eq
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: csetm r1, ne
; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: sxtb r0, r0
; CHECK-NEXT: sxtb r1, r1
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <2 x i8> %b, zeroinitializer
%xx = sext <2 x i8> %x to <2 x i64>
%yy = sext <2 x i8> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
ret i64 %z
}
; Predicated multiply-add reduction on native <2 x i64> operands (no
; extension): lanes with %b == 0 contribute x*y, others 0.
; Expected output: each 64-bit product is expanded into umull plus mla on
; the 32-bit halves, each i64 compare-with-zero is done by orrs of the two
; halves followed by cset/csetm, and the masked lanes are summed with
; adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: vmov r0, r12, d3
; CHECK-NEXT: vmov r2, lr, d1
; CHECK-NEXT: vmov r4, r9, d2
; CHECK-NEXT: vmov r6, r7, d0
; CHECK-NEXT: umull r1, r8, r2, r0
; CHECK-NEXT: umull r3, r5, r6, r4
; CHECK-NEXT: vmov q0[2], q0[0], r3, r1
; CHECK-NEXT: mla r1, r2, r12, r8
; CHECK-NEXT: mla r0, lr, r0, r1
; CHECK-NEXT: mla r1, r6, r9, r5
; CHECK-NEXT: mla r1, r7, r4, r1
; CHECK-NEXT: vmov q0[3], q0[1], r1, r0
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: vmov r1, r2, d4
; CHECK-NEXT: cset r0, eq
; CHECK-NEXT: cmp r0, #0
; CHECK-NEXT: csetm r0, ne
; CHECK-NEXT: orrs r1, r2
; CHECK-NEXT: cset r1, eq
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: csetm r1, ne
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, pc}
entry:
%c = icmp eq <2 x i64> %b, zeroinitializer
%m = mul <2 x i64> %x, %y
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
ret i64 %z
}
; Accumulating variant: predicated i32 multiply-add reduction of <4 x i32>
; (lanes with %b == 0 contribute x*y, others 0) with scalar %a added to the
; result. Expected output: the add of %a is folded into one VPT-predicated
; vmlavat.u32 on r0.
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavat.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <4 x i32> %b, zeroinitializer
%m = mul <4 x i32> %x, %y
%s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating variant: predicated i64 multiply-add reduction over
; zero-extended <4 x i32> inputs, plus the i64 accumulator %a.
; Expected output: one VPT-predicated vmlalvat.u32 on the r0/r1 pair.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlalvat.u32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <4 x i32> %b, zeroinitializer
%xx = zext <4 x i32> %x to <4 x i64>
%yy = zext <4 x i32> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating variant: predicated i64 multiply-add reduction over
; sign-extended <4 x i32> inputs, plus the i64 accumulator %a.
; Expected output: one VPT-predicated vmlalvat.s32 on the r0/r1 pair.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlalvat.s32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <4 x i32> %b, zeroinitializer
%xx = sext <4 x i32> %x to <4 x i64>
%yy = sext <4 x i32> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating variant: predicated i64 multiply-add reduction over
; zero-extended <2 x i32> inputs, plus the i64 accumulator %a.
; Expected output: products formed with vmullb.u32, the lane mask built in
; scalar registers with cset/csetm and applied with vand, the two lanes
; summed with adds.w/adc.w and then added to r0/r1 with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vmullb.u32 q3, q0, q1
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: cset r3, eq
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vand q0, q3, q0
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: vmov r3, r2, d0
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%c = icmp eq <2 x i32> %b, zeroinitializer
%xx = zext <2 x i32> %x to <2 x i64>
%yy = zext <2 x i32> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
%r = add i64 %z, %a
ret i64 %r
}
; Signed counterpart of add_v2i32_v2i64_acc_zext: lanes with %b == 0
; contribute sext(x)*sext(y), others 0, and %a is added to the sum.
; Expected output matches the unsigned test except that the products are
; formed with vmullb.s32.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vmullb.s32 q3, q0, q1
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: cset r3, eq
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vand q0, q3, q0
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: vmov r3, r2, d0
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%c = icmp eq <2 x i32> %b, zeroinitializer
%xx = sext <2 x i32> %x to <2 x i64>
%yy = sext <2 x i32> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating variant: predicated i32 multiply-add reduction over
; zero-extended <8 x i16> inputs, plus the i32 accumulator %a.
; Expected output: one VPT-predicated vmlavat.u16 on r0.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavat.u16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = zext <8 x i16> %x to <8 x i32>
%yy = zext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating variant: predicated i32 multiply-add reduction over
; sign-extended <8 x i16> inputs, plus the i32 accumulator %a.
; Expected output: one VPT-predicated vmlavat.s16 on r0.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavat.s16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = sext <8 x i16> %x to <8 x i32>
%yy = sext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating variant: predicated i32 multiply-add reduction over
; zero-extended <4 x i16> inputs, plus the i32 accumulator %a.
; Expected output: all three operands widened with vmovlb.u16, then one
; VPT-predicated vmlavat.u32 on r0.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u16 q1, q1
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmovlb.u16 q2, q2
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavat.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <4 x i16> %b, zeroinitializer
%xx = zext <4 x i16> %x to <4 x i32>
%yy = zext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating variant: predicated i32 multiply-add reduction over
; sign-extended <4 x i16> inputs, plus the i32 accumulator %a.
; Expected output: %x/%y widened with vmovlb.s16 and %b with vmovlb.u16,
; then one VPT-predicated vmlavat.u32 on r0 (the signedness is carried by the
; explicit widening, so the reduction mnemonic is the same as in the
; zero-extended test).
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmovlb.u16 q2, q2
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavat.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <4 x i16> %b, zeroinitializer
%xx = sext <4 x i16> %x to <4 x i32>
%yy = sext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
%r = add i32 %z, %a
ret i32 %r
}
; Accumulating variant with no widening: predicated i16 multiply-add
; reduction of <8 x i16> plus the i16 accumulator %a, returned as zeroext.
; Expected output: one VPT-predicated vmlavat.u16 on r0 followed by uxth to
; produce the zero-extended 16-bit return value.
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i16 %a) {
; CHECK-LABEL: add_v8i16_v8i16_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavat.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <8 x i16> %b, zeroinitializer
%m = mul <8 x i16> %x, %y
%s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
%r = add i16 %z, %a
ret i16 %r
}
; Accumulating variant: predicated i64 multiply-add reduction over
; <8 x i16> inputs zero-extended directly to i64, plus the accumulator %a.
; Expected output: one VPT-predicated vmlalvat.u16 on the r0/r1 pair.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvat.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = zext <8 x i16> %x to <8 x i64>
%yy = zext <8 x i16> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating variant: predicated i64 multiply-add reduction over
; <8 x i16> inputs sign-extended directly to i64, plus the accumulator %a.
; Expected output: one VPT-predicated vmlalvat.s16 on the r0/r1 pair.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = sext <8 x i16> %x to <8 x i64>
%yy = sext <8 x i16> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
%r = add i64 %z, %a
ret i64 %r
}
; Two-step extension: inputs are zero-extended to i32, multiplied in i32,
; and the product is zero-extended to i64 before the select, reduction and
; add of %a. Expected output is still one VPT-predicated vmlalvat.u16 on the
; r0/r1 pair, i.e. the same code as the direct i16-to-i64 zext test.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvat.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = zext <8 x i16> %x to <8 x i32>
%yy = zext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%ma = zext <8 x i32> %m to <8 x i64>
%s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
%r = add i64 %z, %a
ret i64 %r
}
; Two-step extension: inputs are sign-extended to i32, multiplied in i32,
; and the product is sign-extended to i64 before the select, reduction and
; add of %a. Expected output is still one VPT-predicated vmlalvat.s16 on the
; r0/r1 pair, i.e. the same code as the direct i16-to-i64 sext test.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = sext <8 x i16> %x to <8 x i32>
%yy = sext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%ma = sext <8 x i32> %m to <8 x i64>
%s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
%r = add i64 %z, %a
ret i64 %r
}
; Mixed extension on a square: %x is sign-extended to i32 and multiplied by
; itself (%y is unused), and the i32 product is then ZERO-extended to i64
; before the select, reduction and add of %a.
; Expected output: one VPT-predicated vmlalvat.s16 with q0 as both
; multiplicands.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sextzext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlalvat.s16 r0, r1, q0, q0
; CHECK-NEXT: bx lr
entry:
%c = icmp eq <8 x i16> %b, zeroinitializer
%xx = sext <8 x i16> %x to <8 x i32>
%m = mul <8 x i32> %xx, %xx
%ma = zext <8 x i32> %m to <8 x i64>
%s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating variant: predicated i64 multiply-add reduction over
; zero-extended <2 x i16> inputs, plus the i64 accumulator %a.
; Expected output is scalarised: operands masked with #0xffff, one umull per
; lane, the lane mask built with cset/csetm and applied with vand, the two
; lanes summed with adds.w/adc.w and then added to r0/r1 with adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i64 q3, #0xffff
; CHECK-NEXT: vand q1, q1, q3
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: umull lr, r12, r3, r2
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vand q1, q2, q3
; CHECK-NEXT: umull r2, r3, r2, r3
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: cset r3, eq
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: vmov r3, r2, d0
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%c = icmp eq <2 x i16> %b, zeroinitializer
%xx = zext <2 x i16> %x to <2 x i64>
%yy = zext <2 x i16> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
%r = add i64 %z, %a
ret i64 %r
}
; Predicated multiply-accumulate, <2 x i16> inputs sign-extended to i64 with
; an i64 accumulator. Lanes where %b == 0 contribute sext(%x) * sext(%y),
; other lanes contribute 0, and the lane sum is added to %a.
; The pinned output contains no vector reduction instruction: the predicate
; is rebuilt as a lane mask (mask %b with 0xffff, then cset/csetm), each lane
; is sign-extended with sxth and multiplied with smull, the mask is applied
; with vand, and the lanes plus the accumulator are summed with adds/adc pairs.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i32 q3, #0xffff
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: cset r3, eq
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vmov q2[3], q2[1], r3, r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: sxth r3, r3
; CHECK-NEXT: smull lr, r12, r3, r2
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: sxth r3, r3
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: smull r2, r3, r2, r3
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: vmov r3, r2, d0
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r7, pc}
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <2 x i16> %b, zeroinitializer
%xx = sext <2 x i16> %x to <2 x i64>
%yy = sext <2 x i16> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i64 %z, %a
ret i64 %r
}
; Predicated multiply-accumulate, <16 x i8> inputs zero-extended to i32 with
; an i32 accumulator. Lanes where %b == 0 contribute zext(%x) * zext(%y),
; other lanes contribute 0, and the lane sum is added to %a.
; The pinned output is a vpt.i8 compare of q2 against zero followed by one
; predicated vmlavat.u8 accumulating into r0.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavat.u8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = zext <16 x i8> %x to <16 x i32>
%yy = zext <16 x i8> %y to <16 x i32>
%m = mul <16 x i32> %xx, %yy
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i32 %z, %a
ret i32 %r
}
; Predicated multiply-accumulate, <16 x i8> inputs sign-extended to i32 with
; an i32 accumulator. Lanes where %b == 0 contribute sext(%x) * sext(%y),
; other lanes contribute 0, and the lane sum is added to %a.
; The pinned output is a vpt.i8 compare of q2 against zero followed by one
; predicated vmlavat.s8 accumulating into r0.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavat.s8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = sext <16 x i8> %x to <16 x i32>
%yy = sext <16 x i8> %y to <16 x i32>
%m = mul <16 x i32> %xx, %yy
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i32 %z, %a
ret i32 %r
}
; Two-step widening variant: <16 x i8> inputs are zero-extended to i16 and
; multiplied there, then the i16 product is zero-extended to i32 before the
; predicated reduction. Lanes where %b != 0 contribute 0; the lane sum is
; added to the i32 accumulator %a.
; The pinned output is the same vpt.i8 + vmlavat.u8 pair as the direct
; i8 -> i32 zero-extending test.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavat.u8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = zext <16 x i8> %x to <16 x i16>
%yy = zext <16 x i8> %y to <16 x i16>
; The multiply happens at the intermediate i16 width.
%m = mul <16 x i16> %xx, %yy
%ma = zext <16 x i16> %m to <16 x i32>
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i32 %z, %a
ret i32 %r
}
; Two-step widening variant: <16 x i8> inputs are sign-extended to i16 and
; multiplied there, then the i16 product is sign-extended to i32 before the
; predicated reduction. Lanes where %b != 0 contribute 0; the lane sum is
; added to the i32 accumulator %a.
; The pinned output is the same vpt.i8 + vmlavat.s8 pair as the direct
; i8 -> i32 sign-extending test.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavat.s8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = sext <16 x i8> %x to <16 x i16>
%yy = sext <16 x i8> %y to <16 x i16>
; The multiply happens at the intermediate i16 width.
%m = mul <16 x i16> %xx, %yy
%ma = sext <16 x i16> %m to <16 x i32>
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i32 %z, %a
ret i32 %r
}
; Predicated square-accumulate, <16 x i8> input with an i32 accumulator.
; %x is sign-extended to i16 and multiplied by itself (%y is unused), the i16
; product is zero-extended to i32, lanes where %b != 0 are replaced by 0, and
; the lane sum is added to %a.
; The pinned output is one predicated vmlavat.s8 using q0 for both
; multiplicands.
; NOTE(review): presumably the zext of the product can be treated as signed
; because a square is never negative - confirm against the combine.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sextzext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavat.s8 r0, q0, q0
; CHECK-NEXT: bx lr
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = sext <16 x i8> %x to <16 x i16>
; Square of the sign-extended input; %y does not take part.
%m = mul <16 x i16> %xx, %xx
%ma = zext <16 x i16> %m to <16 x i32>
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i32 %z, %a
ret i32 %r
}
; Predicated multiply-accumulate, <4 x i8> inputs zero-extended to i32 with
; an i32 accumulator. Lanes where %b == 0 contribute zext(%x) * zext(%y),
; other lanes contribute 0, and the lane sum is added to %a.
; In the pinned output all three vector operands are first masked with 0xff
; per 32-bit lane, then a vpt.i32 compare predicates one vmlavat.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q3, #0xff
; CHECK-NEXT: vand q1, q1, q3
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavat.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <4 x i8> %b, zeroinitializer
%xx = zext <4 x i8> %x to <4 x i32>
%yy = zext <4 x i8> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i32 %z, %a
ret i32 %r
}
; Predicated multiply-accumulate, <4 x i8> inputs sign-extended to i32 with
; an i32 accumulator. Lanes where %b == 0 contribute sext(%x) * sext(%y),
; other lanes contribute 0, and the lane sum is added to %a.
; In the pinned output the multiplicands are sign-extended in-register with
; vmovlb.s8 followed by vmovlb.s16, the predicate source is masked with 0xff,
; and a vpt.i32 compare predicates one vmlavat.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmov.i32 q3, #0xff
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vpt.i32 eq, q2, zr
; CHECK-NEXT: vmlavat.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <4 x i8> %b, zeroinitializer
%xx = sext <4 x i8> %x to <4 x i32>
%yy = sext <4 x i8> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i32 %z, %a
ret i32 %r
}
; Predicated multiply-accumulate, <16 x i8> inputs zero-extended to i16 with
; an i16 accumulator and a zeroext i16 result. Lanes where %b == 0 contribute
; zext(%x) * zext(%y), other lanes contribute 0, and the lane sum is added
; to %a.
; The pinned output is vpt.i8 + vmlavat.u8 into r0, followed by uxth to
; zero-extend the 16-bit return value.
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavat.u8 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = zext <16 x i8> %x to <16 x i16>
%yy = zext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i16 %z, %a
ret i16 %r
}
; Predicated multiply-accumulate, <16 x i8> inputs sign-extended to i16 with
; an i16 accumulator and a signext i16 result. Lanes where %b == 0 contribute
; sext(%x) * sext(%y), other lanes contribute 0, and the lane sum is added
; to %a.
; The pinned output is vpt.i8 + vmlavat.s8 into r0, followed by sxth to
; sign-extend the 16-bit return value.
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavat.s8 r0, q0, q1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = sext <16 x i8> %x to <16 x i16>
%yy = sext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i16 %z, %a
ret i16 %r
}
; Predicated multiply-accumulate, <8 x i8> inputs zero-extended to i16 with
; an i16 accumulator and a zeroext i16 result. Lanes where %b == 0 contribute
; zext(%x) * zext(%y), other lanes contribute 0, and the lane sum is added
; to %a.
; In the pinned output all three vector operands are widened with vmovlb.u8,
; a vpt.i16 compare predicates one vmlavat.u16, and uxth zero-extends the
; 16-bit return value.
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q1, q1
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmovlb.u8 q2, q2
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavat.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <8 x i8> %b, zeroinitializer
%xx = zext <8 x i8> %x to <8 x i16>
%yy = zext <8 x i8> %y to <8 x i16>
%m = mul <8 x i16> %xx, %yy
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i16 %z, %a
ret i16 %r
}
; Predicated multiply-accumulate, <8 x i8> inputs sign-extended to i16 with
; an i16 accumulator and a signext i16 result. Lanes where %b == 0 contribute
; sext(%x) * sext(%y), other lanes contribute 0, and the lane sum is added
; to %a.
; In the pinned output the multiplicands are widened with vmovlb.s8 and the
; predicate source with vmovlb.u8, a vpt.i16 compare predicates one
; vmlavat.u16, and sxth sign-extends the 16-bit return value.
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.u8 q2, q2
; CHECK-NEXT: vpt.i16 eq, q2, zr
; CHECK-NEXT: vmlavat.u16 r0, q0, q1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <8 x i8> %b, zeroinitializer
%xx = sext <8 x i8> %x to <8 x i16>
%yy = sext <8 x i8> %y to <8 x i16>
%m = mul <8 x i16> %xx, %yy
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i16 %z, %a
ret i16 %r
}
; Predicated multiply-accumulate with no widening: <16 x i8> inputs are
; multiplied at i8 width, lanes where %b != 0 are replaced by 0, and the lane
; sum is added to the i8 accumulator %a. The result is returned zeroext.
; The pinned output is vpt.i8 + vmlavat.u8 into r0, followed by uxtb to
; zero-extend the 8-bit return value.
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i8 %a) {
; CHECK-LABEL: add_v16i8_v16i8_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vpt.i8 eq, q2, zr
; CHECK-NEXT: vmlavat.u8 r0, q0, q1
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <16 x i8> %b, zeroinitializer
%m = mul <16 x i8> %x, %y
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i8 %z, %a
ret i8 %r
}
; Predicated multiply-accumulate, <16 x i8> inputs zero-extended all the way
; to i64 with an i64 accumulator. Lanes where %b == 0 contribute
; zext(%x) * zext(%y), other lanes contribute 0, and the lane sum is added
; to %a.
; The pinned output contains no vector reduction instruction; the work is
; expanded two lanes at a time:
;  - the i8 predicate is widened step by step with vpsel/vcmp, read out with
;    vmrs, and bits 0/4/8/12 are turned into lane masks via and/ubfx + rsbs;
;  - each pair of bytes is extracted, masked with 0xff and multiplied
;    with umull;
;  - the products are masked with vand and summed through adds/adc chains.
; The sequence saves r4-r6/lr and d8-d15 and uses a 32-byte stack area for
; two 16-byte vector spills.
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: vmov q4, q0
; CHECK-NEXT: vcmp.i8 eq, q2, zr
; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vpsel q5, q2, q0
; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vmov.u8 r2, q5[0]
; CHECK-NEXT: vstrw.32 q2, [sp] @ 16-byte Spill
; CHECK-NEXT: vmov.16 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q5[1]
; CHECK-NEXT: vmov.16 q2[1], r2
; CHECK-NEXT: vmov.u8 r2, q5[2]
; CHECK-NEXT: vmov.16 q2[2], r2
; CHECK-NEXT: vmov.u8 r2, q5[3]
; CHECK-NEXT: vmov.16 q2[3], r2
; CHECK-NEXT: vmov.u8 r2, q5[4]
; CHECK-NEXT: vmov.16 q2[4], r2
; CHECK-NEXT: vmov.u8 r2, q5[5]
; CHECK-NEXT: vmov.16 q2[5], r2
; CHECK-NEXT: vmov.u8 r2, q5[6]
; CHECK-NEXT: vmov.16 q2[6], r2
; CHECK-NEXT: vmov.u8 r2, q5[7]
; CHECK-NEXT: vmov.16 q2[7], r2
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vcmp.i16 ne, q2, zr
; CHECK-NEXT: vpsel q6, q3, q0
; CHECK-NEXT: vmov.u16 r2, q6[2]
; CHECK-NEXT: vmov.u16 r3, q6[0]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vmov.u16 r2, q6[3]
; CHECK-NEXT: vmov.u16 r3, q6[1]
; CHECK-NEXT: vmov q2[3], q2[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q2, zr
; CHECK-NEXT: vmov.i64 q2, #0xff
; CHECK-NEXT: vmrs lr, p0
; CHECK-NEXT: and r2, lr, #1
; CHECK-NEXT: ubfx r3, lr, #4, #1
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q7[2], q7[0], r2, r3
; CHECK-NEXT: vmov q7[3], q7[1], r2, r3
; CHECK-NEXT: vmov.u8 r2, q1[1]
; CHECK-NEXT: vmov.u8 r3, q1[0]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov.u8 r3, q4[1]
; CHECK-NEXT: vmov.u8 r2, q4[0]
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov q3[2], q3[0], r2, r3
; CHECK-NEXT: vmov r12, s2
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vmov r4, s12
; CHECK-NEXT: umull r2, r12, r2, r12
; CHECK-NEXT: umull r3, r4, r4, r3
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov q0[3], q0[1], r4, r12
; CHECK-NEXT: vand q0, q0, q7
; CHECK-NEXT: vmov r2, r12, d1
; CHECK-NEXT: vmov r3, r4, d0
; CHECK-NEXT: adds r6, r3, r2
; CHECK-NEXT: ubfx r2, lr, #12, #1
; CHECK-NEXT: adc.w r12, r12, r4
; CHECK-NEXT: ubfx r4, lr, #8, #1
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: rsbs r4, r4, #0
; CHECK-NEXT: vmov q7[2], q7[0], r4, r2
; CHECK-NEXT: vmov.u8 r3, q4[2]
; CHECK-NEXT: vmov q7[3], q7[1], r4, r2
; CHECK-NEXT: vmov.u8 r2, q1[3]
; CHECK-NEXT: vmov.u8 r4, q1[2]
; CHECK-NEXT: vmov q0[2], q0[0], r4, r2
; CHECK-NEXT: vmov.u8 r4, q4[3]
; CHECK-NEXT: vmov q3[2], q3[0], r3, r4
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s14
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: umull r5, r4, r5, r4
; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
; CHECK-NEXT: vmov q0[3], q0[1], r4, r3
; CHECK-NEXT: vmov.u8 r4, q4[4]
; CHECK-NEXT: vand q0, q0, q7
; CHECK-NEXT: vmov q7, q4
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds r2, r2, r6
; CHECK-NEXT: vmov r6, r5, d1
; CHECK-NEXT: adc.w r3, r3, r12
; CHECK-NEXT: adds.w r12, r2, r6
; CHECK-NEXT: vmov.u16 r2, q6[6]
; CHECK-NEXT: vmov.u16 r6, q6[4]
; CHECK-NEXT: adc.w lr, r3, r5
; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
; CHECK-NEXT: vmov.u16 r2, q6[7]
; CHECK-NEXT: vmov.u16 r6, q6[5]
; CHECK-NEXT: vmov q0[3], q0[1], r6, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: and r5, r2, #1
; CHECK-NEXT: ubfx r6, r2, #4, #1
; CHECK-NEXT: rsbs r5, r5, #0
; CHECK-NEXT: rsbs r6, r6, #0
; CHECK-NEXT: vmov q6[2], q6[0], r5, r6
; CHECK-NEXT: vmov q6[3], q6[1], r5, r6
; CHECK-NEXT: vmov.u8 r6, q1[5]
; CHECK-NEXT: vmov.u8 r5, q1[4]
; CHECK-NEXT: vmov q0[2], q0[0], r5, r6
; CHECK-NEXT: vmov.u8 r5, q4[5]
; CHECK-NEXT: vmov q3[2], q3[0], r4, r5
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov r6, s2
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vmov r3, s12
; CHECK-NEXT: umull r6, r5, r5, r6
; CHECK-NEXT: umull r3, r4, r3, r4
; CHECK-NEXT: vmov q0[2], q0[0], r3, r6
; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
; CHECK-NEXT: vand q0, q0, q6
; CHECK-NEXT: vmov r3, r4, d0
; CHECK-NEXT: adds.w r3, r3, r12
; CHECK-NEXT: adc.w r6, lr, r4
; CHECK-NEXT: vmov r5, r4, d1
; CHECK-NEXT: adds r3, r3, r5
; CHECK-NEXT: vmov.u8 r5, q4[6]
; CHECK-NEXT: adc.w r12, r6, r4
; CHECK-NEXT: ubfx r6, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
; CHECK-NEXT: rsbs r6, r6, #0
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: vmov q6[2], q6[0], r2, r6
; CHECK-NEXT: vmov q6[3], q6[1], r2, r6
; CHECK-NEXT: vmov.u8 r2, q1[7]
; CHECK-NEXT: vmov.u8 r6, q1[6]
; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
; CHECK-NEXT: vmov.u8 r6, q4[7]
; CHECK-NEXT: vmov q3[2], q3[0], r5, r6
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r6, s14
; CHECK-NEXT: vmov r4, s12
; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov r5, s0
; CHECK-NEXT: umull r2, r6, r6, r2
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
; CHECK-NEXT: vmov q0[3], q0[1], r4, r6
; CHECK-NEXT: vmov.u8 r4, q7[8]
; CHECK-NEXT: vand q0, q0, q6
; CHECK-NEXT: vmov r2, r6, d0
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, r12, r6
; CHECK-NEXT: vmov r6, r5, d1
; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: adds.w r12, r2, r6
; CHECK-NEXT: vmov.u8 r2, q5[8]
; CHECK-NEXT: vmov.16 q6[0], r2
; CHECK-NEXT: vmov.u8 r2, q5[9]
; CHECK-NEXT: vmov.16 q6[1], r2
; CHECK-NEXT: vmov.u8 r2, q5[10]
; CHECK-NEXT: vmov.16 q6[2], r2
; CHECK-NEXT: vmov.u8 r2, q5[11]
; CHECK-NEXT: vmov.16 q6[3], r2
; CHECK-NEXT: vmov.u8 r2, q5[12]
; CHECK-NEXT: vmov.16 q6[4], r2
; CHECK-NEXT: vmov.u8 r2, q5[13]
; CHECK-NEXT: vmov.16 q6[5], r2
; CHECK-NEXT: vmov.u8 r2, q5[14]
; CHECK-NEXT: vmov.16 q6[6], r2
; CHECK-NEXT: vmov.u8 r2, q5[15]
; CHECK-NEXT: vmov.16 q6[7], r2
; CHECK-NEXT: adc.w lr, r3, r5
; CHECK-NEXT: vcmp.i16 ne, q6, zr
; CHECK-NEXT: vpsel q3, q3, q0
; CHECK-NEXT: vmov.u16 r2, q3[2]
; CHECK-NEXT: vmov.u16 r6, q3[0]
; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
; CHECK-NEXT: vmov.u16 r2, q3[3]
; CHECK-NEXT: vmov.u16 r6, q3[1]
; CHECK-NEXT: vmov q0[3], q0[1], r6, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: and r5, r2, #1
; CHECK-NEXT: ubfx r6, r2, #4, #1
; CHECK-NEXT: rsbs r5, r5, #0
; CHECK-NEXT: rsbs r6, r6, #0
; CHECK-NEXT: vmov q4[2], q4[0], r5, r6
; CHECK-NEXT: vmov q4[3], q4[1], r5, r6
; CHECK-NEXT: vmov.u8 r6, q1[9]
; CHECK-NEXT: vmov.u8 r5, q1[8]
; CHECK-NEXT: vmov q0[2], q0[0], r5, r6
; CHECK-NEXT: vmov.u8 r5, q7[9]
; CHECK-NEXT: vmov q5[2], q5[0], r4, r5
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q5, q5, q2
; CHECK-NEXT: vmov r6, s2
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vmov r3, s20
; CHECK-NEXT: umull r6, r5, r5, r6
; CHECK-NEXT: umull r3, r4, r3, r4
; CHECK-NEXT: vmov q0[2], q0[0], r3, r6
; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
; CHECK-NEXT: vand q0, q0, q4
; CHECK-NEXT: vmov r3, r4, d0
; CHECK-NEXT: adds.w r3, r3, r12
; CHECK-NEXT: adc.w r6, lr, r4
; CHECK-NEXT: vmov r5, r4, d1
; CHECK-NEXT: adds r3, r3, r5
; CHECK-NEXT: vmov.u8 r5, q7[10]
; CHECK-NEXT: adc.w r12, r6, r4
; CHECK-NEXT: ubfx r6, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
; CHECK-NEXT: rsbs r6, r6, #0
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: vmov q4[2], q4[0], r2, r6
; CHECK-NEXT: vmov q4[3], q4[1], r2, r6
; CHECK-NEXT: vmov.u8 r2, q1[11]
; CHECK-NEXT: vmov.u8 r6, q1[10]
; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
; CHECK-NEXT: vmov.u8 r6, q7[11]
; CHECK-NEXT: vmov q5[2], q5[0], r5, r6
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q5, q5, q2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r6, s22
; CHECK-NEXT: vmov r5, s0
; CHECK-NEXT: vmov r4, s20
; CHECK-NEXT: umull r2, r6, r6, r2
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
; CHECK-NEXT: vmov q0[3], q0[1], r4, r6
; CHECK-NEXT: vmov.u8 r4, q7[12]
; CHECK-NEXT: vand q0, q0, q4
; CHECK-NEXT: vmov r2, r6, d0
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, r12, r6
; CHECK-NEXT: vmov r6, r5, d1
; CHECK-NEXT: adds.w r12, r2, r6
; CHECK-NEXT: vmov.u16 r2, q3[6]
; CHECK-NEXT: vmov.u16 r6, q3[4]
; CHECK-NEXT: adc.w lr, r3, r5
; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
; CHECK-NEXT: vmov.u16 r2, q3[7]
; CHECK-NEXT: vmov.u16 r6, q3[5]
; CHECK-NEXT: vmov q0[3], q0[1], r6, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vmrs r2, p0
; CHECK-NEXT: and r5, r2, #1
; CHECK-NEXT: ubfx r6, r2, #4, #1
; CHECK-NEXT: rsbs r5, r5, #0
; CHECK-NEXT: rsbs r6, r6, #0
; CHECK-NEXT: vmov q3[2], q3[0], r5, r6
; CHECK-NEXT: vmov q3[3], q3[1], r5, r6
; CHECK-NEXT: vmov.u8 r6, q1[13]
; CHECK-NEXT: vmov.u8 r5, q1[12]
; CHECK-NEXT: vmov q0[2], q0[0], r5, r6
; CHECK-NEXT: vmov.u8 r5, q7[13]
; CHECK-NEXT: vmov q4[2], q4[0], r4, r5
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r6, s2
; CHECK-NEXT: vmov r5, s18
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: umull r6, r5, r5, r6
; CHECK-NEXT: umull r3, r4, r3, r4
; CHECK-NEXT: vmov q0[2], q0[0], r3, r6
; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vmov r3, r4, d0
; CHECK-NEXT: adds.w r3, r3, r12
; CHECK-NEXT: adc.w r6, lr, r4
; CHECK-NEXT: vmov r5, r4, d1
; CHECK-NEXT: adds r3, r3, r5
; CHECK-NEXT: vmov.u8 r5, q7[14]
; CHECK-NEXT: adc.w r12, r6, r4
; CHECK-NEXT: ubfx r6, r2, #12, #1
; CHECK-NEXT: ubfx r2, r2, #8, #1
; CHECK-NEXT: rsbs r6, r6, #0
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: vmov q3[2], q3[0], r2, r6
; CHECK-NEXT: vmov q3[3], q3[1], r2, r6
; CHECK-NEXT: vmov.u8 r2, q1[15]
; CHECK-NEXT: vmov.u8 r6, q1[14]
; CHECK-NEXT: vmov q0[2], q0[0], r6, r2
; CHECK-NEXT: vmov.u8 r6, q7[15]
; CHECK-NEXT: vmov q1[2], q1[0], r5, r6
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r6, s6
; CHECK-NEXT: vmov r5, s0
; CHECK-NEXT: vmov r4, s4
; CHECK-NEXT: umull r2, r6, r6, r2
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov q0[2], q0[0], r5, r2
; CHECK-NEXT: vmov q0[3], q0[1], r4, r6
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vmov r2, r6, d0
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, r12, r6
; CHECK-NEXT: vmov r6, r5, d1
; CHECK-NEXT: adds r2, r2, r6
; CHECK-NEXT: adcs r3, r5
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = zext <16 x i8> %x to <16 x i64>
%yy = zext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i64 %z, %a
ret i64 %r
}
; Predicated multiply-accumulate, <16 x i8> inputs sign-extended all the way
; to i64 with an i64 accumulator. Lanes where %b == 0 contribute
; sext(%x) * sext(%y), other lanes contribute 0, and the lane sum is added
; to %a.
; The pinned output contains no vector reduction instruction; the work is
; expanded two lanes at a time:
;  - the i8 predicate is widened step by step with vpsel/vcmp, read out with
;    vmrs, and bits 0/4/8/12 are turned into lane masks via and/ubfx + rsb;
;  - each byte is extracted sign-extended with vmov.s8 and multiplied
;    with smull;
;  - the products are masked with vand and summed through adds/adc chains.
; The sequence saves r4-r6/lr and d8-d15; unlike the zero-extending variant
; no extra stack area is reserved.
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vcmp.i8 eq, q2, zr
; CHECK-NEXT: vmov.i8 q2, #0x0
; CHECK-NEXT: vmov.i8 q3, #0xff
; CHECK-NEXT: vmov.s8 r4, q0[0]
; CHECK-NEXT: vpsel q4, q3, q2
; CHECK-NEXT: vmov.s8 r5, q0[2]
; CHECK-NEXT: vmov.u8 r2, q4[0]
; CHECK-NEXT: vmov.16 q5[0], r2
; CHECK-NEXT: vmov.u8 r2, q4[1]
; CHECK-NEXT: vmov.16 q5[1], r2
; CHECK-NEXT: vmov.u8 r2, q4[2]
; CHECK-NEXT: vmov.16 q5[2], r2
; CHECK-NEXT: vmov.u8 r2, q4[3]
; CHECK-NEXT: vmov.16 q5[3], r2
; CHECK-NEXT: vmov.u8 r2, q4[4]
; CHECK-NEXT: vmov.16 q5[4], r2
; CHECK-NEXT: vmov.u8 r2, q4[5]
; CHECK-NEXT: vmov.16 q5[5], r2
; CHECK-NEXT: vmov.u8 r2, q4[6]
; CHECK-NEXT: vmov.16 q5[6], r2
; CHECK-NEXT: vmov.u8 r2, q4[7]
; CHECK-NEXT: vmov.16 q5[7], r2
; CHECK-NEXT: vcmp.i16 ne, q5, zr
; CHECK-NEXT: vpsel q5, q3, q2
; CHECK-NEXT: vmov.u16 r2, q5[2]
; CHECK-NEXT: vmov.u16 r3, q5[0]
; CHECK-NEXT: vmov q6[2], q6[0], r3, r2
; CHECK-NEXT: vmov.u16 r2, q5[3]
; CHECK-NEXT: vmov.u16 r3, q5[1]
; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q6, zr
; CHECK-NEXT: vmrs r12, p0
; CHECK-NEXT: and r2, r12, #1
; CHECK-NEXT: ubfx r3, r12, #4, #1
; CHECK-NEXT: rsbs r2, r2, #0
; CHECK-NEXT: rsbs r3, r3, #0
; CHECK-NEXT: vmov q6[2], q6[0], r2, r3
; CHECK-NEXT: vmov q6[3], q6[1], r2, r3
; CHECK-NEXT: vmov.s8 r2, q1[1]
; CHECK-NEXT: vmov.s8 r3, q0[1]
; CHECK-NEXT: smull r2, lr, r3, r2
; CHECK-NEXT: vmov.s8 r3, q1[0]
; CHECK-NEXT: smull r3, r4, r4, r3
; CHECK-NEXT: vmov q7[2], q7[0], r3, r2
; CHECK-NEXT: vmov q7[3], q7[1], r4, lr
; CHECK-NEXT: vand q6, q7, q6
; CHECK-NEXT: vmov r2, lr, d13
; CHECK-NEXT: vmov r4, r3, d12
; CHECK-NEXT: adds r6, r4, r2
; CHECK-NEXT: ubfx r4, r12, #12, #1
; CHECK-NEXT: ubfx r2, r12, #8, #1
; CHECK-NEXT: rsb.w r4, r4, #0
; CHECK-NEXT: rsb.w r2, r2, #0
; CHECK-NEXT: adc.w lr, lr, r3
; CHECK-NEXT: vmov q6[2], q6[0], r2, r4
; CHECK-NEXT: vmov.s8 r3, q1[2]
; CHECK-NEXT: vmov q6[3], q6[1], r2, r4
; CHECK-NEXT: vmov.s8 r2, q1[3]
; CHECK-NEXT: vmov.s8 r4, q0[3]
; CHECK-NEXT: smull r3, r5, r5, r3
; CHECK-NEXT: smull r2, r4, r4, r2
; CHECK-NEXT: vmov q7[2], q7[0], r3, r2
; CHECK-NEXT: vmov q7[3], q7[1], r5, r4
; CHECK-NEXT: vand q6, q7, q6
; CHECK-NEXT: vmov r2, r3, d12
; CHECK-NEXT: adds r2, r2, r6
; CHECK-NEXT: vmov r6, r5, d13
; CHECK-NEXT: adc.w r3, r3, lr
; CHECK-NEXT: adds.w r12, r2, r6
; CHECK-NEXT: vmov.u16 r6, q5[6]
; CHECK-NEXT: adc.w lr, r3, r5
; CHECK-NEXT: vmov.u16 r5, q5[4]
; CHECK-NEXT: vmov q6[2], q6[0], r5, r6
; CHECK-NEXT: vmov.u16 r6, q5[7]
; CHECK-NEXT: vmov.u16 r5, q5[5]
; CHECK-NEXT: vmov.s8 r2, q1[4]
; CHECK-NEXT: vmov q6[3], q6[1], r5, r6
; CHECK-NEXT: vmov.s8 r3, q0[4]
; CHECK-NEXT: vcmp.i32 ne, q6, zr
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmrs r6, p0
; CHECK-NEXT: and r4, r6, #1
; CHECK-NEXT: ubfx r5, r6, #4, #1
; CHECK-NEXT: rsbs r4, r4, #0
; CHECK-NEXT: rsbs r5, r5, #0
; CHECK-NEXT: vmov q5[2], q5[0], r4, r5
; CHECK-NEXT: vmov q5[3], q5[1], r4, r5
; CHECK-NEXT: vmov.s8 r5, q1[5]
; CHECK-NEXT: vmov.s8 r4, q0[5]
; CHECK-NEXT: smull r5, r4, r4, r5
; CHECK-NEXT: vmov q6[2], q6[0], r2, r5
; CHECK-NEXT: vmov q6[3], q6[1], r3, r4
; CHECK-NEXT: vand q5, q6, q5
; CHECK-NEXT: vmov r2, r3, d10
; CHECK-NEXT: vmov r5, r4, d11
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adc.w r3, r3, lr
; CHECK-NEXT: adds.w r12, r2, r5
; CHECK-NEXT: ubfx r5, r6, #12, #1
; CHECK-NEXT: ubfx r6, r6, #8, #1
; CHECK-NEXT: rsb.w r5, r5, #0
; CHECK-NEXT: rsb.w r6, r6, #0
; CHECK-NEXT: vmov q5[2], q5[0], r6, r5
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov q5[3], q5[1], r6, r5
; CHECK-NEXT: vmov.s8 r6, q1[7]
; CHECK-NEXT: vmov.s8 r5, q0[7]
; CHECK-NEXT: vmov.s8 r4, q1[6]
; CHECK-NEXT: vmov.s8 r2, q0[6]
; CHECK-NEXT: smull r6, r5, r5, r6
; CHECK-NEXT: smull r2, r4, r2, r4
; CHECK-NEXT: vmov q6[2], q6[0], r2, r6
; CHECK-NEXT: vmov q6[3], q6[1], r4, r5
; CHECK-NEXT: vand q5, q6, q5
; CHECK-NEXT: vmov r2, r6, d10
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adcs r3, r6
; CHECK-NEXT: vmov r6, r5, d11
; CHECK-NEXT: adds.w r12, r2, r6
; CHECK-NEXT: vmov.u8 r6, q4[8]
; CHECK-NEXT: vmov.16 q5[0], r6
; CHECK-NEXT: vmov.u8 r6, q4[9]
; CHECK-NEXT: vmov.16 q5[1], r6
; CHECK-NEXT: vmov.u8 r6, q4[10]
; CHECK-NEXT: vmov.16 q5[2], r6
; CHECK-NEXT: vmov.u8 r6, q4[11]
; CHECK-NEXT: vmov.16 q5[3], r6
; CHECK-NEXT: vmov.u8 r6, q4[12]
; CHECK-NEXT: vmov.16 q5[4], r6
; CHECK-NEXT: vmov.u8 r6, q4[13]
; CHECK-NEXT: vmov.16 q5[5], r6
; CHECK-NEXT: vmov.u8 r6, q4[14]
; CHECK-NEXT: vmov.16 q5[6], r6
; CHECK-NEXT: vmov.u8 r6, q4[15]
; CHECK-NEXT: vmov.16 q5[7], r6
; CHECK-NEXT: adc.w lr, r3, r5
; CHECK-NEXT: vcmp.i16 ne, q5, zr
; CHECK-NEXT: vmov.s8 r2, q1[8]
; CHECK-NEXT: vpsel q2, q3, q2
; CHECK-NEXT: vmov.s8 r3, q0[8]
; CHECK-NEXT: vmov.u16 r6, q2[2]
; CHECK-NEXT: vmov.u16 r5, q2[0]
; CHECK-NEXT: vmov q3[2], q3[0], r5, r6
; CHECK-NEXT: vmov.u16 r6, q2[3]
; CHECK-NEXT: vmov.u16 r5, q2[1]
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov q3[3], q3[1], r5, r6
; CHECK-NEXT: vcmp.i32 ne, q3, zr
; CHECK-NEXT: vmrs r6, p0
; CHECK-NEXT: and r4, r6, #1
; CHECK-NEXT: ubfx r5, r6, #4, #1
; CHECK-NEXT: rsbs r4, r4, #0
; CHECK-NEXT: rsbs r5, r5, #0
; CHECK-NEXT: vmov q3[2], q3[0], r4, r5
; CHECK-NEXT: vmov q3[3], q3[1], r4, r5
; CHECK-NEXT: vmov.s8 r5, q1[9]
; CHECK-NEXT: vmov.s8 r4, q0[9]
; CHECK-NEXT: smull r5, r4, r4, r5
; CHECK-NEXT: vmov q4[2], q4[0], r2, r5
; CHECK-NEXT: vmov q4[3], q4[1], r3, r4
; CHECK-NEXT: vand q3, q4, q3
; CHECK-NEXT: vmov r2, r3, d6
; CHECK-NEXT: vmov r5, r4, d7
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adc.w r3, r3, lr
; CHECK-NEXT: adds.w r12, r2, r5
; CHECK-NEXT: ubfx r5, r6, #12, #1
; CHECK-NEXT: ubfx r6, r6, #8, #1
; CHECK-NEXT: rsb.w r5, r5, #0
; CHECK-NEXT: rsb.w r6, r6, #0
; CHECK-NEXT: vmov q3[2], q3[0], r6, r5
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov q3[3], q3[1], r6, r5
; CHECK-NEXT: vmov.s8 r6, q1[11]
; CHECK-NEXT: vmov.s8 r5, q0[11]
; CHECK-NEXT: vmov.s8 r4, q1[10]
; CHECK-NEXT: vmov.s8 r2, q0[10]
; CHECK-NEXT: smull r6, r5, r5, r6
; CHECK-NEXT: smull r2, r4, r2, r4
; CHECK-NEXT: vmov q4[2], q4[0], r2, r6
; CHECK-NEXT: vmov q4[3], q4[1], r4, r5
; CHECK-NEXT: vand q3, q4, q3
; CHECK-NEXT: vmov r2, r6, d6
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adcs r3, r6
; CHECK-NEXT: vmov r6, r5, d7
; CHECK-NEXT: adds.w r12, r2, r6
; CHECK-NEXT: vmov.u16 r6, q2[6]
; CHECK-NEXT: adc.w lr, r3, r5
; CHECK-NEXT: vmov.u16 r5, q2[4]
; CHECK-NEXT: vmov q3[2], q3[0], r5, r6
; CHECK-NEXT: vmov.u16 r6, q2[7]
; CHECK-NEXT: vmov.u16 r5, q2[5]
; CHECK-NEXT: vmov.s8 r2, q1[12]
; CHECK-NEXT: vmov q3[3], q3[1], r5, r6
; CHECK-NEXT: vmov.s8 r3, q0[12]
; CHECK-NEXT: vcmp.i32 ne, q3, zr
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmrs r6, p0
; CHECK-NEXT: and r4, r6, #1
; CHECK-NEXT: ubfx r5, r6, #4, #1
; CHECK-NEXT: rsbs r4, r4, #0
; CHECK-NEXT: rsbs r5, r5, #0
; CHECK-NEXT: vmov q2[2], q2[0], r4, r5
; CHECK-NEXT: vmov q2[3], q2[1], r4, r5
; CHECK-NEXT: vmov.s8 r5, q1[13]
; CHECK-NEXT: vmov.s8 r4, q0[13]
; CHECK-NEXT: smull r5, r4, r4, r5
; CHECK-NEXT: vmov q3[2], q3[0], r2, r5
; CHECK-NEXT: vmov q3[3], q3[1], r3, r4
; CHECK-NEXT: vand q2, q3, q2
; CHECK-NEXT: vmov r2, r3, d4
; CHECK-NEXT: vmov r5, r4, d5
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adc.w r3, r3, lr
; CHECK-NEXT: adds.w r12, r2, r5
; CHECK-NEXT: ubfx r5, r6, #12, #1
; CHECK-NEXT: ubfx r6, r6, #8, #1
; CHECK-NEXT: rsb.w r5, r5, #0
; CHECK-NEXT: rsb.w r6, r6, #0
; CHECK-NEXT: vmov q2[2], q2[0], r6, r5
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov q2[3], q2[1], r6, r5
; CHECK-NEXT: vmov.s8 r6, q1[15]
; CHECK-NEXT: vmov.s8 r5, q0[15]
; CHECK-NEXT: vmov.s8 r4, q1[14]
; CHECK-NEXT: vmov.s8 r2, q0[14]
; CHECK-NEXT: smull r6, r5, r5, r6
; CHECK-NEXT: smull r2, r4, r2, r4
; CHECK-NEXT: vmov q0[2], q0[0], r2, r6
; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r2, r6, d0
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adcs r3, r6
; CHECK-NEXT: vmov r6, r5, d1
; CHECK-NEXT: adds r2, r2, r6
; CHECK-NEXT: adcs r3, r5
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <16 x i8> %b, zeroinitializer
%xx = sext <16 x i8> %x to <16 x i64>
%yy = sext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i64 %z, %a
ret i64 %r
}
; Predicated multiply-accumulate, <2 x i8> inputs zero-extended to i64 with
; an i64 accumulator. Lanes where %b == 0 contribute zext(%x) * zext(%y),
; other lanes contribute 0, and the lane sum is added to %a.
; The pinned output contains no vector reduction instruction: the inputs are
; masked with 0xff, each lane is multiplied with umull, the predicate is
; rebuilt as a lane mask with cset/csetm and applied with vand, and the two
; lanes plus the accumulator are summed with adds/adc pairs.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i64 q3, #0xff
; CHECK-NEXT: vand q1, q1, q3
; CHECK-NEXT: vand q0, q0, q3
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: umull lr, r12, r3, r2
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vand q1, q2, q3
; CHECK-NEXT: umull r2, r3, r2, r3
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: cset r3, eq
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: vmov r3, r2, d0
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r7, pc}
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <2 x i8> %b, zeroinitializer
%xx = zext <2 x i8> %x to <2 x i64>
%yy = zext <2 x i8> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i64 %z, %a
ret i64 %r
}
; Predicated multiply-accumulate, <2 x i8> inputs sign-extended to i64 with
; an i64 accumulator. Lanes where %b == 0 contribute sext(%x) * sext(%y),
; other lanes contribute 0, and the lane sum is added to %a.
; The pinned output contains no vector reduction instruction: the predicate
; is rebuilt as a lane mask (mask %b with 0xff, then cset/csetm), each lane
; is sign-extended with sxtb and multiplied with smull, the mask is applied
; with vand, and the lanes plus the accumulator are summed with adds/adc pairs.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i32 q3, #0xff
; CHECK-NEXT: vand q2, q2, q3
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: cset r3, eq
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vmov q2[3], q2[1], r3, r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull lr, r12, r3, r2
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: smull r2, r3, r2, r3
; CHECK-NEXT: vmov q0[2], q0[0], r2, lr
; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: vmov r3, r2, d0
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r7, pc}
entry:
; Per-lane predicate: true where %b is zero.
%c = icmp eq <2 x i8> %b, zeroinitializer
%xx = sext <2 x i8> %x to <2 x i64>
%yy = sext <2 x i8> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
; Inactive lanes become 0 so they do not contribute to the sum.
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
; Fold the incoming accumulator into the reduced value.
%r = add i64 %z, %a
ret i64 %r
}
; Predicated multiply-accumulate reduction on native <2 x i64> operands.
; Multiplies %x and %y lane-wise (no extension step), zeroes every lane whose
; %b element is non-zero, sums the remaining lanes with
; llvm.vector.reduce.add.v2i64 and adds the scalar accumulator %a.
; The expected output below is autogenerated; it shows each 64-bit lane product
; formed with umull plus two mla instructions, the zero test on each 64-bit %b
; lane done by OR-ing its two 32-bit halves (orrs), and the mask applied with
; vand before the final adds/adcs accumulation into r0/r1.
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b, i64 %a) {
; CHECK-LABEL: add_v2i64_v2i64_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: vmov r2, r12, d3
; CHECK-NEXT: vmov r3, lr, d1
; CHECK-NEXT: vmov r6, r9, d2
; CHECK-NEXT: vmov r5, r11, d0
; CHECK-NEXT: umull r10, r8, r3, r2
; CHECK-NEXT: umull r4, r7, r5, r6
; CHECK-NEXT: mla r3, r3, r12, r8
; CHECK-NEXT: vmov q0[2], q0[0], r4, r10
; CHECK-NEXT: mla r2, lr, r2, r3
; CHECK-NEXT: mla r3, r5, r9, r7
; CHECK-NEXT: mla r3, r11, r6, r3
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: orrs r2, r3
; CHECK-NEXT: vmov r3, r7, d4
; CHECK-NEXT: cset r2, eq
; CHECK-NEXT: cmp r2, #0
; CHECK-NEXT: csetm r2, ne
; CHECK-NEXT: orrs r3, r7
; CHECK-NEXT: cset r3, eq
; CHECK-NEXT: cmp r3, #0
; CHECK-NEXT: csetm r3, ne
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: vmov r7, r6, d0
; CHECK-NEXT: adds r2, r2, r7
; CHECK-NEXT: adcs r3, r6
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
; Lane predicate: true where the corresponding element of %b equals zero.
%c = icmp eq <2 x i64> %b, zeroinitializer
%m = mul <2 x i64> %x, %y
; Inactive lanes contribute 0 to the sum.
%s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
; Fold the reduced value into the incoming accumulator.
%r = add i64 %z, %a
ret i64 %r
}
; Declarations of the llvm.vector.reduce.add intrinsic overloads, one per
; vector type (element types i8, i16, i32 and i64 at various lane counts).
; The test functions immediately above call the v2i64 overload; the other
; overloads are presumably used by tests elsewhere in this file.
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)