llvm-mirror/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
David Green 48eaab481d [ARM] Extend more reductions during lowering
This relaxes the VMLAV and VADDV reduction recognition code to handle
smaller-than-legal types, extending them as needed. That was already
handled for some reductions; this patch extends it to more types in a
more generic way: if a smaller-than-legal value is found, it is
extended to the legal type as needed.
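
For example, a <4 x i16> multiply-add reduction uses a smaller than
legal vector type, so its operands are now extended in-register first
(this mirrors add_v4i16_v4i32_zext below):

  %xx = zext <4 x i16> %x to <4 x i32>
  %yy = zext <4 x i16> %y to <4 x i32>
  %m = mul <4 x i32> %xx, %yy
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)

now selects

  vmovlb.u16 q1, q1
  vmovlb.u16 q0, q0
  vmlav.u32 r0, q0, q1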

Differential Revision: https://reviews.llvm.org/D106051
2021-07-19 08:58:03 +01:00

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
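; A legal <4 x i32> multiply-add reduction maps straight onto a single VMLAV.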
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%m = mul <4 x i32> %x, %y
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
ret i32 %z
}
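; Reductions to i64 from extended <4 x i32> operands select the widening VMLALV, unsigned or signed.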
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%yy = zext <4 x i32> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%yy = sext <4 x i32> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
ret i64 %z
}
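; <2 x i32> inputs do not fit the VMLALV pattern; VMULLB forms the two i64 products and the lanes are summed in GPRs with ADDS/ADC.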
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmullb.u32 q2, q0, q1
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: vmov r2, r3, d4
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%yy = zext <2 x i32> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmullb.s32 q2, q0, q1
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: vmov r2, r3, d4
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%yy = sext <2 x i32> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.u16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%yy = zext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.s16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%yy = sext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
ret i32 %z
}
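; The smaller-than-legal cases this patch targets: <4 x i16> operands are extended in-register with VMOVLB and then feed the ordinary 32-bit VMLAV.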
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u16 q1, q1
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmlav.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%yy = zext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmlav.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%yy = sext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%m = mul <8 x i16> %x, %y
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
ret i16 %z
}
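; i16 -> i64 reductions still need only one VMLALV.U16/S16, which accumulates the widened products into the r0/r1 register pair.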
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalv.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%yy = zext <8 x i16> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%yy = sext <8 x i16> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
ret i64 %z
}
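; When the operand widths differ, the narrower vector gets an extra VMOVLB so both sides reach the VMLALV at the same element type.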
define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_zext(<8 x i16> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8i16_v8i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q1, q1
; CHECK-NEXT: vmlalv.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%yy = zext <8 x i8> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_sext(<8 x i16> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8i16_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%yy = sext <8 x i8> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i16_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u16 q1, q1
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%yy = zext <4 x i16> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i16_v4i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%yy = sext <4 x i16> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
ret i64 %z
}
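; Extending the i32 products a second time still folds into one VMLALV; for x*x the product is non-negative, so even the zext-of-sext case can use the signed form.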
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalv.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%yy = zext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%ma = zext <8 x i32> %m to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%yy = sext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%ma = sext <8 x i32> %m to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sextzext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_sextzext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%m = mul <8 x i32> %xx, %xx
%ma = zext <8 x i32> %m to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
ret i64 %z
}
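; <2 x i16> cases are scalarised: lanes are masked (zext) or SXTH'd (sext) in GPRs and combined with UMULL/UMLAL or SMULL/SMLAL.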
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-LABEL: add_v2i16_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q2, #0xffff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r0, r1, r1, r0
; CHECK-NEXT: umlal r0, r1, r3, r2
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%yy = zext <2 x i16> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: sxth r1, r1
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: sxth r3, r3
; CHECK-NEXT: smlal r0, r1, r3, r2
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%yy = sext <2 x i16> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
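; The same patterns for i8 inputs: full <16 x i8> vectors use VMLAV.U8/S8 directly, and <8 x i8> is first widened with VMOVLB.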
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.u8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%yy = zext <16 x i8> %y to <16 x i32>
%m = mul <16 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.s8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%yy = sext <16 x i8> %y to <16 x i32>
%m = mul <16 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q1, q1
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmlav.u16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%yy = zext <8 x i8> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmlav.s16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%yy = sext <8 x i8> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_zext(<8 x i8> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i8i16_v8i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmlav.u16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%yy = zext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_sext(<8 x i8> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i8i16_v8i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmlav.s16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%yy = sext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.u8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%yy = zext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
%ma = zext <16 x i16> %m to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.s8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%yy = sext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
%ma = sext <16 x i16> %m to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sextzext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_sextzext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.s8 r0, q0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%m = mul <16 x i16> %xx, %xx
%ma = zext <16 x i16> %m to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
ret i32 %z
}
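; <4 x i8> operands need a two-step extension: a VAND mask for zext, or VMOVLB.S8 followed by VMOVLB.S16 for sext.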
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q2, #0xff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmlav.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%yy = zext <4 x i8> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmlav.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%yy = sext <4 x i8> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
ret i32 %z
}
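; A 32-bit result only needs the low, sign-agnostic half of each product, so mixed sign/zero extension still reaches a single VMLAV once each operand is extended its own way.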
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_szext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i32_szext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmov.i32 q2, #0xff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmlav.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%yy = zext <4 x i8> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.u8 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%yy = zext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
ret i16 %z
}
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.s8 r0, q0, q1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%yy = sext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
ret i16 %z
}
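; No 8-bit VMLAV variant mixes sign and zero extension, so this case spills both vectors, reloads each half with extending VLDRB loads, multiplies in i16 and finishes with a plain VADDV.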
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_szext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_szext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: add r0, sp, #16
; CHECK-NEXT: mov r1, sp
; CHECK-NEXT: vstrw.32 q1, [r0]
; CHECK-NEXT: vstrw.32 q0, [r1]
; CHECK-NEXT: vldrb.u16 q0, [r0, #8]
; CHECK-NEXT: vldrb.s16 q1, [r1, #8]
; CHECK-NEXT: vldrb.s16 q2, [r1]
; CHECK-NEXT: vmul.i16 q0, q1, q0
; CHECK-NEXT: vldrb.u16 q1, [r0]
; CHECK-NEXT: vmul.i16 q1, q2, q1
; CHECK-NEXT: vadd.i16 q0, q1, q0
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%yy = zext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
ret i16 %z
}
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i16_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q1, q1
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmlav.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%yy = zext <8 x i8> %y to <8 x i16>
%m = mul <8 x i16> %xx, %yy
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
ret i16 %z
}
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i16_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmlav.u16 r0, q0, q1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%yy = sext <8 x i8> %y to <8 x i16>
%m = mul <8 x i16> %xx, %yy
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
ret i16 %z
}
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.u8 r0, q0, q1
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%m = mul <16 x i8> %x, %y
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %m)
ret i8 %z
}
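; <16 x i8> -> i64 reductions are split in half through the stack: each half is reloaded with an extending VLDRB and the VMLALV result is chained into an accumulating VMLALVA.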
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: add r2, sp, #16
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vstrw.32 q1, [r2]
; CHECK-NEXT: vstrw.32 q0, [r3]
; CHECK-NEXT: vldrb.u16 q0, [r2]
; CHECK-NEXT: vldrb.u16 q1, [r3]
; CHECK-NEXT: vmlalv.u16 r0, r1, q1, q0
; CHECK-NEXT: vldrb.u16 q0, [r2, #8]
; CHECK-NEXT: vldrb.u16 q1, [r3, #8]
; CHECK-NEXT: vmlalva.u16 r0, r1, q1, q0
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%yy = zext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: add r2, sp, #16
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vstrw.32 q1, [r2]
; CHECK-NEXT: vstrw.32 q0, [r3]
; CHECK-NEXT: vldrb.s16 q0, [r2]
; CHECK-NEXT: vldrb.s16 q1, [r3]
; CHECK-NEXT: vmlalv.s16 r0, r1, q1, q0
; CHECK-NEXT: vldrb.s16 q0, [r2, #8]
; CHECK-NEXT: vldrb.s16 q1, [r3, #8]
; CHECK-NEXT: vmlalva.s16 r0, r1, q1, q0
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%yy = sext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
ret i64 %z
}
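; When the inputs come from memory the spill disappears; the extending loads read each half directly.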
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext_load(<16 x i8> *%xp, <16 x i8> *%yp) {
; CHECK-LABEL: add_v16i8_v16i64_zext_load:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrb.u16 q1, [r0]
; CHECK-NEXT: vmlalv.u16 r2, r3, q1, q0
; CHECK-NEXT: vldrb.u16 q0, [r1, #8]
; CHECK-NEXT: vldrb.u16 q1, [r0, #8]
; CHECK-NEXT: vmlalva.u16 r2, r3, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bx lr
entry:
%x = load <16 x i8>, <16 x i8>* %xp
%y = load <16 x i8>, <16 x i8>* %yp
%xx = zext <16 x i8> %x to <16 x i64>
%yy = zext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext_load(<16 x i8> *%xp, <16 x i8> *%yp) {
; CHECK-LABEL: add_v16i8_v16i64_sext_load:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.s16 q0, [r1]
; CHECK-NEXT: vldrb.s16 q1, [r0]
; CHECK-NEXT: vmlalv.s16 r2, r3, q1, q0
; CHECK-NEXT: vldrb.s16 q0, [r1, #8]
; CHECK-NEXT: vldrb.s16 q1, [r0, #8]
; CHECK-NEXT: vmlalva.s16 r2, r3, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bx lr
entry:
%x = load <16 x i8>, <16 x i8>* %xp
%y = load <16 x i8>, <16 x i8>* %yp
%xx = sext <16 x i8> %x to <16 x i64>
%yy = sext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q1, q1
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmlalv.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%yy = zext <8 x i8> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%yy = sext <8 x i8> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q2, #0xff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%yy = zext <4 x i8> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%yy = sext <4 x i8> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_zext(<4 x i8> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i8i16_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q2, #0xff
; CHECK-NEXT: vmovlb.u16 q1, q1
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%yy = zext <4 x i16> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_sext(<4 x i8> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i8i16_v4i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%yy = sext <4 x i16> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_zext(<4 x i8> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q2, #0xff
; CHECK-NEXT: vmovlb.u16 q1, q1
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmul.i32 q0, q0, q1
; CHECK-NEXT: vaddlv.u32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%yy = zext <4 x i16> %y to <4 x i32>
%mm = mul <4 x i32> %xx, %yy
%m = zext <4 x i32> %mm to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_sext(<4 x i8> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmul.i32 q0, q0, q1
; CHECK-NEXT: vaddlv.s32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%yy = sext <4 x i16> %y to <4 x i32>
%mm = mul <4 x i32> %xx, %yy
%m = sext <4 x i32> %mm to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
ret i64 %z
}
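; <2 x i8> zext products fit in 16 bits, so the high words out of UMULL are known zero and combine with ORR instead of ADC.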
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: add_v2i8_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q2, #0xff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r0, r1, r1, r0
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: orrs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%yy = zext <2 x i8> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: sxtb r0, r0
; CHECK-NEXT: sxtb r1, r1
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smlal r0, r1, r3, r2
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%yy = sext <2 x i8> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
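; <2 x i64> has no MVE vector multiply; each 64x64->64 product is built from a UMULL plus two MLAs before the final ADDS/ADC.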
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: vmov r0, lr, d3
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: umull r12, r1, r2, r0
; CHECK-NEXT: mla r1, r2, lr, r1
; CHECK-NEXT: mla lr, r3, r0, r1
; CHECK-NEXT: vmov r0, r2, d2
; CHECK-NEXT: vmov r3, r1, d0
; CHECK-NEXT: umull r4, r5, r3, r0
; CHECK-NEXT: mla r2, r3, r2, r5
; CHECK-NEXT: mla r1, r1, r0, r2
; CHECK-NEXT: adds.w r0, r4, r12
; CHECK-NEXT: adc.w r1, r1, lr
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%m = mul <2 x i64> %x, %y
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
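; The _acc variants below add a scalar accumulator, which folds away by switching to the accumulating VMLAVA/VMLALVA forms.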
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%m = mul <4 x i32> %x, %y
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalva.u32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%yy = zext <4 x i32> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalva.s32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%yy = sext <4 x i32> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmullb.u32 q2, q0, q1
; CHECK-NEXT: vmov lr, r12, d5
; CHECK-NEXT: vmov r3, r2, d4
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%yy = zext <2 x i32> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmullb.s32 q2, q0, q1
; CHECK-NEXT: vmov lr, r12, d5
; CHECK-NEXT: vmov r3, r2, d4
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%yy = sext <2 x i32> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.u16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%yy = zext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.s16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%yy = sext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u16 q1, q1
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmlava.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%yy = zext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmlava.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%yy = sext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, i16 %a) {
; CHECK-LABEL: add_v8i16_v8i16_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%m = mul <8 x i16> %x, %y
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
%r = add i16 %z, %a
ret i16 %r
}
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalva.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%yy = zext <8 x i16> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalva.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%yy = sext <8 x i16> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalva.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%yy = zext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%ma = zext <8 x i32> %m to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalva.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%yy = sext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%ma = sext <8 x i32> %m to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sextzext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sextzext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalva.s16 r0, r1, q0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%m = mul <8 x i32> %xx, %xx
%ma = zext <8 x i32> %m to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %ma)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i64 q2, #0xffff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov r12, s4
; CHECK-NEXT: umull r2, lr, r3, r2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umlal r2, lr, r3, r12
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, lr
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%yy = zext <2 x i16> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: sxth r3, r3
; CHECK-NEXT: smull r2, r12, r3, r2
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: sxth.w lr, r3
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: sxth r3, r3
; CHECK-NEXT: smlal r2, r12, r3, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r12
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%yy = sext <2 x i16> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.u8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%yy = zext <16 x i8> %y to <16 x i32>
%m = mul <16 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.s8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%yy = sext <16 x i8> %y to <16 x i32>
%m = mul <16 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.u8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%yy = zext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
%ma = zext <16 x i16> %m to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.s8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%yy = sext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
%ma = sext <16 x i16> %m to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sextzext(<16 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sextzext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.s8 r0, q0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%m = mul <16 x i16> %xx, %xx
%ma = zext <16 x i16> %m to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %ma)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q2, #0xff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmlava.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%yy = zext <4 x i8> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmlava.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%yy = sext <4 x i8> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.u8 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%yy = zext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
%r = add i16 %z, %a
ret i16 %r
}
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.s8 r0, q0, q1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%yy = sext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %m)
%r = add i16 %z, %a
ret i16 %r
}
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q1, q1
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmlava.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%yy = zext <8 x i8> %y to <8 x i16>
%m = mul <8 x i16> %xx, %yy
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
%r = add i16 %z, %a
ret i16 %r
}
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmlava.u16 r0, q0, q1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%yy = sext <8 x i8> %y to <8 x i16>
%m = mul <8 x i16> %xx, %yy
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %m)
%r = add i16 %z, %a
ret i16 %r
}
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, i8 %a) {
; CHECK-LABEL: add_v16i8_v16i8_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.u8 r0, q0, q1
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%m = mul <16 x i8> %x, %y
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %m)
%r = add i8 %z, %a
ret i8 %r
}
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: add r2, sp, #16
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vstrw.32 q1, [r2]
; CHECK-NEXT: vstrw.32 q0, [r3]
; CHECK-NEXT: vldrb.u16 q0, [r2]
; CHECK-NEXT: vldrb.u16 q1, [r3]
; CHECK-NEXT: vmlalva.u16 r0, r1, q1, q0
; CHECK-NEXT: vldrb.u16 q0, [r2, #8]
; CHECK-NEXT: vldrb.u16 q1, [r3, #8]
; CHECK-NEXT: vmlalva.u16 r0, r1, q1, q0
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%yy = zext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: add r2, sp, #16
; CHECK-NEXT: mov r3, sp
; CHECK-NEXT: vstrw.32 q1, [r2]
; CHECK-NEXT: vstrw.32 q0, [r3]
; CHECK-NEXT: vldrb.s16 q0, [r2]
; CHECK-NEXT: vldrb.s16 q1, [r3]
; CHECK-NEXT: vmlalva.s16 r0, r1, q1, q0
; CHECK-NEXT: vldrb.s16 q0, [r2, #8]
; CHECK-NEXT: vldrb.s16 q1, [r3, #8]
; CHECK-NEXT: vmlalva.s16 r0, r1, q1, q0
; CHECK-NEXT: add sp, #32
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%yy = sext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext_load(<16 x i8> *%xp, <16 x i8> *%yp, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_zext_load:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.u16 q0, [r1]
; CHECK-NEXT: vldrb.u16 q1, [r0]
; CHECK-NEXT: vmlalva.u16 r2, r3, q1, q0
; CHECK-NEXT: vldrb.u16 q0, [r1, #8]
; CHECK-NEXT: vldrb.u16 q1, [r0, #8]
; CHECK-NEXT: vmlalva.u16 r2, r3, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bx lr
entry:
%x = load <16 x i8>, <16 x i8>* %xp
%y = load <16 x i8>, <16 x i8>* %yp
%xx = zext <16 x i8> %x to <16 x i64>
%yy = zext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext_load(<16 x i8> *%xp, <16 x i8> *%yp, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_sext_load:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vldrb.s16 q0, [r1]
; CHECK-NEXT: vldrb.s16 q1, [r0]
; CHECK-NEXT: vmlalva.s16 r2, r3, q1, q0
; CHECK-NEXT: vldrb.s16 q0, [r1, #8]
; CHECK-NEXT: vldrb.s16 q1, [r0, #8]
; CHECK-NEXT: vmlalva.s16 r2, r3, q1, q0
; CHECK-NEXT: mov r0, r2
; CHECK-NEXT: mov r1, r3
; CHECK-NEXT: bx lr
entry:
%x = load <16 x i8>, <16 x i8>* %xp
%y = load <16 x i8>, <16 x i8>* %yp
%xx = sext <16 x i8> %x to <16 x i64>
%yy = sext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i64 q2, #0xff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: umull r12, lr, r3, r2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: add r2, r12
; CHECK-NEXT: orr.w r3, r3, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%yy = zext <2 x i8> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r12, r3, r2
; CHECK-NEXT: vmov r3, s4
; CHECK-NEXT: sxtb.w lr, r3
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smlal r2, r12, r3, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r12
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%yy = sext <2 x i8> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, i64 %a) {
; CHECK-LABEL: add_v2i64_v2i64_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: vmov r2, r12, d3
; CHECK-NEXT: vmov r3, lr, d1
; CHECK-NEXT: vmov r4, r6, d0
; CHECK-NEXT: umull r8, r5, r3, r2
; CHECK-NEXT: mla r3, r3, r12, r5
; CHECK-NEXT: mla r12, lr, r2, r3
; CHECK-NEXT: vmov r3, r5, d2
; CHECK-NEXT: umull r7, r2, r4, r3
; CHECK-NEXT: mla r2, r4, r5, r2
; CHECK-NEXT: mla r2, r6, r3, r2
; CHECK-NEXT: adds.w r3, r7, r8
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc}
entry:
%m = mul <2 x i64> %x, %y
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)