llvm-mirror/test/CodeGen/Thumb2/mve-vecreduce-mla.ll
David Green 9a7508e506 [ARM] Extra MVE VMLAV reduction patterns
These patterns for i8 and i16 VMLAs were missing. They arise from the
legalized vector.reduce.add.v8i16 and vector.reduce.add.v16i8, and
although the instruction works differently (the mul and add are
performed at a higher precision), I believe it is OK because only an
i8/i16 is demanded from them, so the results will be the same. At
least, they pass any testing I can think to run on them.
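
(A sketch of why the narrowing is safe: truncation commutes with mul and
add, so only the low i8/i16 bits of the inputs can affect the demanded
low bits of the result. For instance, with i8 lanes a = 200 and b = 3
widened to i16, the i16 product is 200 * 3 = 600 = 0x0258; truncating to
i8 gives 0x58 = 88, which matches 600 mod 256 = 88 computed directly at
i8 precision.)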

There are some tests that end up looking worse, but they are quite
artificial, arising from passing half-sized vector types through a call
boundary. I would not expect the vmull to come up like that
realistically, and a vmlava is likely better a lot of the time.

Differential Revision: https://reviews.llvm.org/D80524
2020-05-29 16:23:24 +01:00

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%m = mul <4 x i32> %x, %y
%z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalv.u32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%yy = zext <4 x i32> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalv.s32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%yy = sext <4 x i32> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmullb.u32 q2, q0, q1
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r1, s11
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%yy = zext <2 x i32> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmullb.s32 q2, q0, q1
; CHECK-NEXT: vmov r0, s10
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r1, s11
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%yy = sext <2 x i32> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.u16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%yy = zext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.s16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%yy = sext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u16 q1, q1
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmlav.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%yy = zext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmlav.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%yy = sext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%m = mul <8 x i16> %x, %y
%z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
ret i16 %z
}
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalv.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%yy = zext <8 x i16> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: add_v8i16_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalv.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%yy = sext <8 x i16> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-LABEL: add_v2i16_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q2, #0xffff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: umull r0, r1, r1, r0
; CHECK-NEXT: umlal r0, r1, r3, r2
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%yy = zext <2 x i16> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: sxth r1, r1
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: sxth r3, r3
; CHECK-NEXT: smlal r0, r1, r3, r2
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%yy = sext <2 x i16> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.u8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%yy = zext <16 x i8> %y to <16 x i32>
%m = mul <16 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.s8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%yy = sext <16 x i8> %y to <16 x i32>
%m = mul <16 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q2, #0xff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmlav.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%yy = zext <4 x i8> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
; CHECK-LABEL: add_v4i8_v4i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmlav.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%yy = sext <4 x i8> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
ret i32 %z
}
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u8 r0, q1[8]
; CHECK-NEXT: vmov.16 q2[0], r0
; CHECK-NEXT: vmov.u8 r0, q1[9]
; CHECK-NEXT: vmov.16 q2[1], r0
; CHECK-NEXT: vmov.u8 r0, q1[10]
; CHECK-NEXT: vmov.16 q2[2], r0
; CHECK-NEXT: vmov.u8 r0, q1[11]
; CHECK-NEXT: vmov.16 q2[3], r0
; CHECK-NEXT: vmov.u8 r0, q1[12]
; CHECK-NEXT: vmov.16 q2[4], r0
; CHECK-NEXT: vmov.u8 r0, q1[13]
; CHECK-NEXT: vmov.16 q2[5], r0
; CHECK-NEXT: vmov.u8 r0, q1[14]
; CHECK-NEXT: vmov.16 q2[6], r0
; CHECK-NEXT: vmov.u8 r0, q1[15]
; CHECK-NEXT: vmov.16 q2[7], r0
; CHECK-NEXT: vmov.u8 r0, q0[8]
; CHECK-NEXT: vmov.16 q3[0], r0
; CHECK-NEXT: vmov.u8 r0, q0[9]
; CHECK-NEXT: vmov.16 q3[1], r0
; CHECK-NEXT: vmov.u8 r0, q0[10]
; CHECK-NEXT: vmov.16 q3[2], r0
; CHECK-NEXT: vmov.u8 r0, q0[11]
; CHECK-NEXT: vmov.16 q3[3], r0
; CHECK-NEXT: vmov.u8 r0, q0[12]
; CHECK-NEXT: vmov.16 q3[4], r0
; CHECK-NEXT: vmov.u8 r0, q0[13]
; CHECK-NEXT: vmov.16 q3[5], r0
; CHECK-NEXT: vmov.u8 r0, q0[14]
; CHECK-NEXT: vmov.16 q3[6], r0
; CHECK-NEXT: vmov.u8 r0, q0[15]
; CHECK-NEXT: vmov.16 q3[7], r0
; CHECK-NEXT: vmov.u8 r0, q1[0]
; CHECK-NEXT: vmullb.u8 q2, q3, q2
; CHECK-NEXT: vmov.16 q3[0], r0
; CHECK-NEXT: vmov.u8 r0, q1[1]
; CHECK-NEXT: vmov.16 q3[1], r0
; CHECK-NEXT: vmov.u8 r0, q1[2]
; CHECK-NEXT: vmov.16 q3[2], r0
; CHECK-NEXT: vmov.u8 r0, q1[3]
; CHECK-NEXT: vmov.16 q3[3], r0
; CHECK-NEXT: vmov.u8 r0, q1[4]
; CHECK-NEXT: vmov.16 q3[4], r0
; CHECK-NEXT: vmov.u8 r0, q1[5]
; CHECK-NEXT: vmov.16 q3[5], r0
; CHECK-NEXT: vmov.u8 r0, q1[6]
; CHECK-NEXT: vmov.16 q3[6], r0
; CHECK-NEXT: vmov.u8 r0, q1[7]
; CHECK-NEXT: vmov.16 q3[7], r0
; CHECK-NEXT: vmov.u8 r0, q0[0]
; CHECK-NEXT: vmov.16 q1[0], r0
; CHECK-NEXT: vmov.u8 r0, q0[1]
; CHECK-NEXT: vmov.16 q1[1], r0
; CHECK-NEXT: vmov.u8 r0, q0[2]
; CHECK-NEXT: vmov.16 q1[2], r0
; CHECK-NEXT: vmov.u8 r0, q0[3]
; CHECK-NEXT: vmov.16 q1[3], r0
; CHECK-NEXT: vmov.u8 r0, q0[4]
; CHECK-NEXT: vmov.16 q1[4], r0
; CHECK-NEXT: vmov.u8 r0, q0[5]
; CHECK-NEXT: vmov.16 q1[5], r0
; CHECK-NEXT: vmov.u8 r0, q0[6]
; CHECK-NEXT: vmov.16 q1[6], r0
; CHECK-NEXT: vmov.u8 r0, q0[7]
; CHECK-NEXT: vmov.16 q1[7], r0
; CHECK-NEXT: vmullb.u8 q0, q1, q3
; CHECK-NEXT: vadd.i16 q0, q0, q2
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%yy = zext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
%z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
ret i16 %z
}
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i16_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u8 r0, q1[8]
; CHECK-NEXT: vmov.16 q2[0], r0
; CHECK-NEXT: vmov.u8 r0, q1[9]
; CHECK-NEXT: vmov.16 q2[1], r0
; CHECK-NEXT: vmov.u8 r0, q1[10]
; CHECK-NEXT: vmov.16 q2[2], r0
; CHECK-NEXT: vmov.u8 r0, q1[11]
; CHECK-NEXT: vmov.16 q2[3], r0
; CHECK-NEXT: vmov.u8 r0, q1[12]
; CHECK-NEXT: vmov.16 q2[4], r0
; CHECK-NEXT: vmov.u8 r0, q1[13]
; CHECK-NEXT: vmov.16 q2[5], r0
; CHECK-NEXT: vmov.u8 r0, q1[14]
; CHECK-NEXT: vmov.16 q2[6], r0
; CHECK-NEXT: vmov.u8 r0, q1[15]
; CHECK-NEXT: vmov.16 q2[7], r0
; CHECK-NEXT: vmov.u8 r0, q0[8]
; CHECK-NEXT: vmov.16 q3[0], r0
; CHECK-NEXT: vmov.u8 r0, q0[9]
; CHECK-NEXT: vmov.16 q3[1], r0
; CHECK-NEXT: vmov.u8 r0, q0[10]
; CHECK-NEXT: vmov.16 q3[2], r0
; CHECK-NEXT: vmov.u8 r0, q0[11]
; CHECK-NEXT: vmov.16 q3[3], r0
; CHECK-NEXT: vmov.u8 r0, q0[12]
; CHECK-NEXT: vmov.16 q3[4], r0
; CHECK-NEXT: vmov.u8 r0, q0[13]
; CHECK-NEXT: vmov.16 q3[5], r0
; CHECK-NEXT: vmov.u8 r0, q0[14]
; CHECK-NEXT: vmov.16 q3[6], r0
; CHECK-NEXT: vmov.u8 r0, q0[15]
; CHECK-NEXT: vmov.16 q3[7], r0
; CHECK-NEXT: vmov.u8 r0, q1[0]
; CHECK-NEXT: vmullb.s8 q2, q3, q2
; CHECK-NEXT: vmov.16 q3[0], r0
; CHECK-NEXT: vmov.u8 r0, q1[1]
; CHECK-NEXT: vmov.16 q3[1], r0
; CHECK-NEXT: vmov.u8 r0, q1[2]
; CHECK-NEXT: vmov.16 q3[2], r0
; CHECK-NEXT: vmov.u8 r0, q1[3]
; CHECK-NEXT: vmov.16 q3[3], r0
; CHECK-NEXT: vmov.u8 r0, q1[4]
; CHECK-NEXT: vmov.16 q3[4], r0
; CHECK-NEXT: vmov.u8 r0, q1[5]
; CHECK-NEXT: vmov.16 q3[5], r0
; CHECK-NEXT: vmov.u8 r0, q1[6]
; CHECK-NEXT: vmov.16 q3[6], r0
; CHECK-NEXT: vmov.u8 r0, q1[7]
; CHECK-NEXT: vmov.16 q3[7], r0
; CHECK-NEXT: vmov.u8 r0, q0[0]
; CHECK-NEXT: vmov.16 q1[0], r0
; CHECK-NEXT: vmov.u8 r0, q0[1]
; CHECK-NEXT: vmov.16 q1[1], r0
; CHECK-NEXT: vmov.u8 r0, q0[2]
; CHECK-NEXT: vmov.16 q1[2], r0
; CHECK-NEXT: vmov.u8 r0, q0[3]
; CHECK-NEXT: vmov.16 q1[3], r0
; CHECK-NEXT: vmov.u8 r0, q0[4]
; CHECK-NEXT: vmov.16 q1[4], r0
; CHECK-NEXT: vmov.u8 r0, q0[5]
; CHECK-NEXT: vmov.16 q1[5], r0
; CHECK-NEXT: vmov.u8 r0, q0[6]
; CHECK-NEXT: vmov.16 q1[6], r0
; CHECK-NEXT: vmov.u8 r0, q0[7]
; CHECK-NEXT: vmov.16 q1[7], r0
; CHECK-NEXT: vmullb.s8 q0, q1, q3
; CHECK-NEXT: vadd.i16 q0, q0, q2
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%yy = sext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
%z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
ret i16 %z
}
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i16_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q1, q1
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmlav.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%yy = zext <8 x i8> %y to <8 x i16>
%m = mul <8 x i16> %xx, %yy
%z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
ret i16 %z
}
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: add_v8i8_v8i16_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmlav.u16 r0, q0, q1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%yy = sext <8 x i8> %y to <8 x i16>
%m = mul <8 x i16> %xx, %yy
%z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
ret i16 %z
}
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlav.u8 r0, q0, q1
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%m = mul <16 x i8> %x, %y
%z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m)
ret i8 %z
}
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vmov.u8 r0, q1[0]
; CHECK-NEXT: vmov.u8 r1, q0[0]
; CHECK-NEXT: vmov.32 q3[0], r0
; CHECK-NEXT: vmov.u8 r0, q1[1]
; CHECK-NEXT: vmov.32 q4[0], r1
; CHECK-NEXT: vmov.u8 r1, q0[1]
; CHECK-NEXT: vmov.32 q3[2], r0
; CHECK-NEXT: vmov.i64 q2, #0xff
; CHECK-NEXT: vmov.32 q4[2], r1
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r0, s14
; CHECK-NEXT: vmov r1, s18
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: umull r12, r1, r1, r0
; CHECK-NEXT: vmov.u8 r0, q0[2]
; CHECK-NEXT: vmov.32 q4[0], r0
; CHECK-NEXT: vmov.u8 r0, q0[3]
; CHECK-NEXT: vmov.32 q4[2], r0
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r0, s16
; CHECK-NEXT: orr.w lr, r3, r1
; CHECK-NEXT: vmov.u8 r3, q1[2]
; CHECK-NEXT: vmov.32 q3[0], r3
; CHECK-NEXT: vmov.u8 r3, q1[3]
; CHECK-NEXT: vmov.32 q3[2], r3
; CHECK-NEXT: add r2, r12
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov r3, s12
; CHECK-NEXT: umull r0, r3, r0, r3
; CHECK-NEXT: vmov.32 q5[0], r0
; CHECK-NEXT: vmov r0, s14
; CHECK-NEXT: vmov.32 q5[1], r3
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: umull r0, r3, r3, r0
; CHECK-NEXT: vmov.32 q5[2], r0
; CHECK-NEXT: vmov.32 q5[3], r3
; CHECK-NEXT: vmov r1, s20
; CHECK-NEXT: vmov r0, s21
; CHECK-NEXT: adds r1, r1, r2
; CHECK-NEXT: adc.w r2, lr, r0
; CHECK-NEXT: vmov r0, s22
; CHECK-NEXT: adds.w r12, r1, r0
; CHECK-NEXT: adc.w r1, r2, r3
; CHECK-NEXT: vmov.u8 r2, q1[4]
; CHECK-NEXT: vmov.u8 r3, q0[4]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[5]
; CHECK-NEXT: vmov.32 q4[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[5]
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov.32 q4[2], r3
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q5[0], r2
; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vmov.32 q5[1], r3
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q5[2], r2
; CHECK-NEXT: vmov.32 q5[3], r3
; CHECK-NEXT: vmov r0, s20
; CHECK-NEXT: vmov r2, s21
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, s22
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u8 r2, q1[6]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[6]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[7]
; CHECK-NEXT: vmov.32 q4[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[7]
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov.32 q4[2], r3
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q5[0], r2
; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vmov.32 q5[1], r3
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q5[2], r2
; CHECK-NEXT: vmov.32 q5[3], r3
; CHECK-NEXT: vmov r0, s20
; CHECK-NEXT: vmov r2, s21
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, s22
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u8 r2, q1[8]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[8]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[9]
; CHECK-NEXT: vmov.32 q4[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[9]
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov.32 q4[2], r3
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q5[0], r2
; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vmov.32 q5[1], r3
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q5[2], r2
; CHECK-NEXT: vmov.32 q5[3], r3
; CHECK-NEXT: vmov r0, s20
; CHECK-NEXT: vmov r2, s21
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, s22
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u8 r2, q1[10]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[10]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[11]
; CHECK-NEXT: vmov.32 q4[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[11]
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov.32 q4[2], r3
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q5[0], r2
; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vmov.32 q5[1], r3
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q5[2], r2
; CHECK-NEXT: vmov.32 q5[3], r3
; CHECK-NEXT: vmov r0, s20
; CHECK-NEXT: vmov r2, s21
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, s22
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u8 r2, q1[12]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[12]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[13]
; CHECK-NEXT: vmov.32 q4[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[13]
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov.32 q4[2], r3
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q5[0], r2
; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vmov.32 q5[1], r3
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q5[2], r2
; CHECK-NEXT: vmov.32 q5[3], r3
; CHECK-NEXT: vmov r0, s20
; CHECK-NEXT: vmov r2, s21
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, s22
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u8 r2, q1[14]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[15]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov.u8 r3, q0[14]
; CHECK-NEXT: vand q1, q3, q2
; CHECK-NEXT: vmov.32 q3[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[15]
; CHECK-NEXT: vmov.32 q3[2], r3
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vand q0, q3, q2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umlal r0, r1, r3, r2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: umlal r0, r1, r3, r2
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%yy = zext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: add_v16i8_v16i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u8 r0, q1[0]
; CHECK-NEXT: vmov.u8 r1, q0[0]
; CHECK-NEXT: sxtb r0, r0
; CHECK-NEXT: sxtb r1, r1
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vmov.32 q2[0], r0
; CHECK-NEXT: vmov.u8 r0, q1[1]
; CHECK-NEXT: vmov.32 q2[1], r1
; CHECK-NEXT: vmov.u8 r1, q0[1]
; CHECK-NEXT: sxtb r0, r0
; CHECK-NEXT: sxtb r1, r1
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vmov.32 q2[2], r0
; CHECK-NEXT: vmov.32 q2[3], r1
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r0, s9
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: vmov.u8 r3, q0[2]
; CHECK-NEXT: adc.w r12, r0, r1
; CHECK-NEXT: vmov.u8 r1, q1[2]
; CHECK-NEXT: sxtb r1, r1
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r1, r3, r3, r1
; CHECK-NEXT: vmov.32 q2[0], r1
; CHECK-NEXT: vmov.u8 r1, q1[3]
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov.u8 r3, q0[3]
; CHECK-NEXT: sxtb r1, r1
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r1, r3, r3, r1
; CHECK-NEXT: vmov.32 q2[2], r1
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov r1, s9
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adc.w r1, r1, r12
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u8 r2, q1[4]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[4]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[5]
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov.u8 r3, q0[5]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u8 r2, q1[6]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[6]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[7]
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov.u8 r3, q0[7]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u8 r2, q1[8]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[8]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[9]
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov.u8 r3, q0[9]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u8 r2, q1[10]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[10]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[11]
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov.u8 r3, q0[11]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u8 r2, q1[12]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[12]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[13]
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov.u8 r3, q0[13]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vmov r0, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u8 r2, q1[14]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[14]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smlal r0, r1, r3, r2
; CHECK-NEXT: vmov.u8 r2, q1[15]
; CHECK-NEXT: vmov.u8 r3, q0[15]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smlal r0, r1, r3, r2
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%yy = sext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: add_v2i8_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q2, #0xff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r0, s6
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r0, r1, r1, r0
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: orrs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%yy = zext <2 x i8> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxtb r0, r0
; CHECK-NEXT: sxtb r1, r1
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smlal r0, r1, r3, r2
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%yy = sext <2 x i8> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vmov r0, s4
; CHECK-NEXT: vmov r1, s0
; CHECK-NEXT: vmov r2, s5
; CHECK-NEXT: vmov r4, s7
; CHECK-NEXT: umull r12, r3, r1, r0
; CHECK-NEXT: mla r1, r1, r2, r3
; CHECK-NEXT: vmov r2, s1
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov.32 q2[0], r12
; CHECK-NEXT: mla r1, r2, r0, r1
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov.32 q2[1], r1
; CHECK-NEXT: vmov r12, s8
; CHECK-NEXT: umull lr, r0, r3, r2
; CHECK-NEXT: mla r0, r3, r4, r0
; CHECK-NEXT: vmov r3, s3
; CHECK-NEXT: mla r2, r3, r2, r0
; CHECK-NEXT: adds.w r0, r12, lr
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r4, pc}
entry:
%m = mul <2 x i64> %x, %y
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
ret i64 %z
}
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%m = mul <4 x i32> %x, %y
%z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalva.u32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%yy = zext <4 x i32> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalva.s32 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%yy = sext <4 x i32> %y to <4 x i64>
%m = mul <4 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmullb.u32 q2, q0, q1
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r12, s11
; CHECK-NEXT: vmov lr, s9
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, lr, r12
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%yy = zext <2 x i32> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmullb.s32 q2, q0, q1
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r12, s11
; CHECK-NEXT: vmov lr, s9
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, lr, r12
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%yy = sext <2 x i32> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.u16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%yy = zext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.s16 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%yy = sext <8 x i16> %y to <8 x i32>
%m = mul <8 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u16 q1, q1
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vmlava.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%yy = zext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmlava.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%yy = sext <4 x i16> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, i16 %a) {
; CHECK-LABEL: add_v8i16_v8i16_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%m = mul <8 x i16> %x, %y
%z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
%r = add i16 %z, %a
ret i16 %r
}
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalva.u16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%yy = zext <8 x i16> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlalva.s16 r0, r1, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%yy = sext <8 x i16> %y to <8 x i64>
%m = mul <8 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i64 q2, #0xffff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r12, s6
; CHECK-NEXT: umull r2, lr, r3, r2
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: umlal r2, lr, r3, r12
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, lr
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%yy = zext <2 x i16> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: sxth r3, r3
; CHECK-NEXT: smull r2, r12, r3, r2
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: sxth.w lr, r3
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxth r3, r3
; CHECK-NEXT: smlal r2, r12, r3, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r12
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%yy = sext <2 x i16> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.u8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%yy = zext <16 x i8> %y to <16 x i32>
%m = mul <16 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.s8 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%yy = sext <16 x i8> %y to <16 x i32>
%m = mul <16 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q2, #0xff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmlava.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%yy = zext <4 x i8> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.s16 q1, q1
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vmlava.u32 r0, q0, q1
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%yy = sext <4 x i8> %y to <4 x i32>
%m = mul <4 x i32> %xx, %yy
%z = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %m)
%r = add i32 %z, %a
ret i32 %r
}
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u8 r1, q1[8]
; CHECK-NEXT: vmov.16 q2[0], r1
; CHECK-NEXT: vmov.u8 r1, q1[9]
; CHECK-NEXT: vmov.16 q2[1], r1
; CHECK-NEXT: vmov.u8 r1, q1[10]
; CHECK-NEXT: vmov.16 q2[2], r1
; CHECK-NEXT: vmov.u8 r1, q1[11]
; CHECK-NEXT: vmov.16 q2[3], r1
; CHECK-NEXT: vmov.u8 r1, q1[12]
; CHECK-NEXT: vmov.16 q2[4], r1
; CHECK-NEXT: vmov.u8 r1, q1[13]
; CHECK-NEXT: vmov.16 q2[5], r1
; CHECK-NEXT: vmov.u8 r1, q1[14]
; CHECK-NEXT: vmov.16 q2[6], r1
; CHECK-NEXT: vmov.u8 r1, q1[15]
; CHECK-NEXT: vmov.16 q2[7], r1
; CHECK-NEXT: vmov.u8 r1, q0[8]
; CHECK-NEXT: vmov.16 q3[0], r1
; CHECK-NEXT: vmov.u8 r1, q0[9]
; CHECK-NEXT: vmov.16 q3[1], r1
; CHECK-NEXT: vmov.u8 r1, q0[10]
; CHECK-NEXT: vmov.16 q3[2], r1
; CHECK-NEXT: vmov.u8 r1, q0[11]
; CHECK-NEXT: vmov.16 q3[3], r1
; CHECK-NEXT: vmov.u8 r1, q0[12]
; CHECK-NEXT: vmov.16 q3[4], r1
; CHECK-NEXT: vmov.u8 r1, q0[13]
; CHECK-NEXT: vmov.16 q3[5], r1
; CHECK-NEXT: vmov.u8 r1, q0[14]
; CHECK-NEXT: vmov.16 q3[6], r1
; CHECK-NEXT: vmov.u8 r1, q0[15]
; CHECK-NEXT: vmov.16 q3[7], r1
; CHECK-NEXT: vmov.u8 r1, q1[0]
; CHECK-NEXT: vmullb.u8 q2, q3, q2
; CHECK-NEXT: vmov.16 q3[0], r1
; CHECK-NEXT: vmov.u8 r1, q1[1]
; CHECK-NEXT: vmov.16 q3[1], r1
; CHECK-NEXT: vmov.u8 r1, q1[2]
; CHECK-NEXT: vmov.16 q3[2], r1
; CHECK-NEXT: vmov.u8 r1, q1[3]
; CHECK-NEXT: vmov.16 q3[3], r1
; CHECK-NEXT: vmov.u8 r1, q1[4]
; CHECK-NEXT: vmov.16 q3[4], r1
; CHECK-NEXT: vmov.u8 r1, q1[5]
; CHECK-NEXT: vmov.16 q3[5], r1
; CHECK-NEXT: vmov.u8 r1, q1[6]
; CHECK-NEXT: vmov.16 q3[6], r1
; CHECK-NEXT: vmov.u8 r1, q1[7]
; CHECK-NEXT: vmov.16 q3[7], r1
; CHECK-NEXT: vmov.u8 r1, q0[0]
; CHECK-NEXT: vmov.16 q1[0], r1
; CHECK-NEXT: vmov.u8 r1, q0[1]
; CHECK-NEXT: vmov.16 q1[1], r1
; CHECK-NEXT: vmov.u8 r1, q0[2]
; CHECK-NEXT: vmov.16 q1[2], r1
; CHECK-NEXT: vmov.u8 r1, q0[3]
; CHECK-NEXT: vmov.16 q1[3], r1
; CHECK-NEXT: vmov.u8 r1, q0[4]
; CHECK-NEXT: vmov.16 q1[4], r1
; CHECK-NEXT: vmov.u8 r1, q0[5]
; CHECK-NEXT: vmov.16 q1[5], r1
; CHECK-NEXT: vmov.u8 r1, q0[6]
; CHECK-NEXT: vmov.16 q1[6], r1
; CHECK-NEXT: vmov.u8 r1, q0[7]
; CHECK-NEXT: vmov.16 q1[7], r1
; CHECK-NEXT: vmullb.u8 q0, q1, q3
; CHECK-NEXT: vadd.i16 q0, q0, q2
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%yy = zext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
%z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
%r = add i16 %z, %a
ret i16 %r
}
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u8 r1, q1[8]
; CHECK-NEXT: vmov.16 q2[0], r1
; CHECK-NEXT: vmov.u8 r1, q1[9]
; CHECK-NEXT: vmov.16 q2[1], r1
; CHECK-NEXT: vmov.u8 r1, q1[10]
; CHECK-NEXT: vmov.16 q2[2], r1
; CHECK-NEXT: vmov.u8 r1, q1[11]
; CHECK-NEXT: vmov.16 q2[3], r1
; CHECK-NEXT: vmov.u8 r1, q1[12]
; CHECK-NEXT: vmov.16 q2[4], r1
; CHECK-NEXT: vmov.u8 r1, q1[13]
; CHECK-NEXT: vmov.16 q2[5], r1
; CHECK-NEXT: vmov.u8 r1, q1[14]
; CHECK-NEXT: vmov.16 q2[6], r1
; CHECK-NEXT: vmov.u8 r1, q1[15]
; CHECK-NEXT: vmov.16 q2[7], r1
; CHECK-NEXT: vmov.u8 r1, q0[8]
; CHECK-NEXT: vmov.16 q3[0], r1
; CHECK-NEXT: vmov.u8 r1, q0[9]
; CHECK-NEXT: vmov.16 q3[1], r1
; CHECK-NEXT: vmov.u8 r1, q0[10]
; CHECK-NEXT: vmov.16 q3[2], r1
; CHECK-NEXT: vmov.u8 r1, q0[11]
; CHECK-NEXT: vmov.16 q3[3], r1
; CHECK-NEXT: vmov.u8 r1, q0[12]
; CHECK-NEXT: vmov.16 q3[4], r1
; CHECK-NEXT: vmov.u8 r1, q0[13]
; CHECK-NEXT: vmov.16 q3[5], r1
; CHECK-NEXT: vmov.u8 r1, q0[14]
; CHECK-NEXT: vmov.16 q3[6], r1
; CHECK-NEXT: vmov.u8 r1, q0[15]
; CHECK-NEXT: vmov.16 q3[7], r1
; CHECK-NEXT: vmov.u8 r1, q1[0]
; CHECK-NEXT: vmullb.s8 q2, q3, q2
; CHECK-NEXT: vmov.16 q3[0], r1
; CHECK-NEXT: vmov.u8 r1, q1[1]
; CHECK-NEXT: vmov.16 q3[1], r1
; CHECK-NEXT: vmov.u8 r1, q1[2]
; CHECK-NEXT: vmov.16 q3[2], r1
; CHECK-NEXT: vmov.u8 r1, q1[3]
; CHECK-NEXT: vmov.16 q3[3], r1
; CHECK-NEXT: vmov.u8 r1, q1[4]
; CHECK-NEXT: vmov.16 q3[4], r1
; CHECK-NEXT: vmov.u8 r1, q1[5]
; CHECK-NEXT: vmov.16 q3[5], r1
; CHECK-NEXT: vmov.u8 r1, q1[6]
; CHECK-NEXT: vmov.16 q3[6], r1
; CHECK-NEXT: vmov.u8 r1, q1[7]
; CHECK-NEXT: vmov.16 q3[7], r1
; CHECK-NEXT: vmov.u8 r1, q0[0]
; CHECK-NEXT: vmov.16 q1[0], r1
; CHECK-NEXT: vmov.u8 r1, q0[1]
; CHECK-NEXT: vmov.16 q1[1], r1
; CHECK-NEXT: vmov.u8 r1, q0[2]
; CHECK-NEXT: vmov.16 q1[2], r1
; CHECK-NEXT: vmov.u8 r1, q0[3]
; CHECK-NEXT: vmov.16 q1[3], r1
; CHECK-NEXT: vmov.u8 r1, q0[4]
; CHECK-NEXT: vmov.16 q1[4], r1
; CHECK-NEXT: vmov.u8 r1, q0[5]
; CHECK-NEXT: vmov.16 q1[5], r1
; CHECK-NEXT: vmov.u8 r1, q0[6]
; CHECK-NEXT: vmov.16 q1[6], r1
; CHECK-NEXT: vmov.u8 r1, q0[7]
; CHECK-NEXT: vmov.16 q1[7], r1
; CHECK-NEXT: vmullb.s8 q0, q1, q3
; CHECK-NEXT: vadd.i16 q0, q0, q2
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%yy = sext <16 x i8> %y to <16 x i16>
%m = mul <16 x i16> %xx, %yy
%z = call i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16> %m)
%r = add i16 %z, %a
ret i16 %r
}
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q1, q1
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmlava.u16 r0, q0, q1
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%yy = zext <8 x i8> %y to <8 x i16>
%m = mul <8 x i16> %xx, %yy
%z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
%r = add i16 %z, %a
ret i16 %r
}
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q1, q1
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmlava.u16 r0, q0, q1
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%yy = sext <8 x i8> %y to <8 x i16>
%m = mul <8 x i16> %xx, %yy
%z = call i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16> %m)
%r = add i16 %z, %a
ret i16 %r
}
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, i8 %a) {
; CHECK-LABEL: add_v16i8_v16i8_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmlava.u8 r0, q0, q1
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%m = mul <16 x i8> %x, %y
%z = call i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8> %m)
%r = add i8 %z, %a
ret i8 %r
}
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r7, lr}
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: vmov.u8 r2, q1[0]
; CHECK-NEXT: vmov.u8 r3, q0[0]
; CHECK-NEXT: vmov.32 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[1]
; CHECK-NEXT: vmov.32 q4[0], r3
; CHECK-NEXT: vmov.u8 r3, q0[1]
; CHECK-NEXT: vmov.32 q3[2], r2
; CHECK-NEXT: vmov.i64 q2, #0xff
; CHECK-NEXT: vmov.32 q4[2], r3
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r2, s14
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: vmov.u8 r4, q0[2]
; CHECK-NEXT: umull r12, lr, r3, r2
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: vmov r2, s12
; CHECK-NEXT: vmov.32 q4[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[3]
; CHECK-NEXT: vmov.32 q4[2], r4
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: orr.w lr, lr, r3
; CHECK-NEXT: vmov.u8 r3, q1[2]
; CHECK-NEXT: vmov.32 q3[0], r3
; CHECK-NEXT: vmov.u8 r3, q1[3]
; CHECK-NEXT: vmov.32 q3[2], r3
; CHECK-NEXT: add r2, r12
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vmov r3, s12
; CHECK-NEXT: umull r3, r4, r4, r3
; CHECK-NEXT: vmov.32 q5[0], r3
; CHECK-NEXT: vmov r3, s14
; CHECK-NEXT: vmov.32 q5[1], r4
; CHECK-NEXT: vmov r4, s18
; CHECK-NEXT: umull r3, r4, r4, r3
; CHECK-NEXT: vmov.32 q5[2], r3
; CHECK-NEXT: vmov.32 q5[3], r4
; CHECK-NEXT: vmov r3, s20
; CHECK-NEXT: vmov r5, s21
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, lr, r5
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: adds.w r12, r2, r5
; CHECK-NEXT: vmov.u8 r5, q1[4]
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov.u8 r4, q0[4]
; CHECK-NEXT: vmov.32 q3[0], r5
; CHECK-NEXT: vmov.u8 r5, q1[5]
; CHECK-NEXT: vmov.32 q4[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[5]
; CHECK-NEXT: vmov.32 q3[2], r5
; CHECK-NEXT: vmov.32 q4[2], r4
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[0], r5
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: vmov.32 q5[1], r4
; CHECK-NEXT: vmov r4, s18
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[2], r5
; CHECK-NEXT: vmov.32 q5[3], r4
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov r5, s21
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adcs r3, r5
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: adds.w r12, r2, r5
; CHECK-NEXT: vmov.u8 r5, q1[6]
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov.u8 r4, q0[6]
; CHECK-NEXT: vmov.32 q3[0], r5
; CHECK-NEXT: vmov.u8 r5, q1[7]
; CHECK-NEXT: vmov.32 q4[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[7]
; CHECK-NEXT: vmov.32 q3[2], r5
; CHECK-NEXT: vmov.32 q4[2], r4
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[0], r5
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: vmov.32 q5[1], r4
; CHECK-NEXT: vmov r4, s18
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[2], r5
; CHECK-NEXT: vmov.32 q5[3], r4
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov r5, s21
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adcs r3, r5
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: adds.w r12, r2, r5
; CHECK-NEXT: vmov.u8 r5, q1[8]
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov.u8 r4, q0[8]
; CHECK-NEXT: vmov.32 q3[0], r5
; CHECK-NEXT: vmov.u8 r5, q1[9]
; CHECK-NEXT: vmov.32 q4[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[9]
; CHECK-NEXT: vmov.32 q3[2], r5
; CHECK-NEXT: vmov.32 q4[2], r4
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[0], r5
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: vmov.32 q5[1], r4
; CHECK-NEXT: vmov r4, s18
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[2], r5
; CHECK-NEXT: vmov.32 q5[3], r4
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov r5, s21
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adcs r3, r5
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: adds.w r12, r2, r5
; CHECK-NEXT: vmov.u8 r5, q1[10]
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov.u8 r4, q0[10]
; CHECK-NEXT: vmov.32 q3[0], r5
; CHECK-NEXT: vmov.u8 r5, q1[11]
; CHECK-NEXT: vmov.32 q4[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[11]
; CHECK-NEXT: vmov.32 q3[2], r5
; CHECK-NEXT: vmov.32 q4[2], r4
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[0], r5
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: vmov.32 q5[1], r4
; CHECK-NEXT: vmov r4, s18
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[2], r5
; CHECK-NEXT: vmov.32 q5[3], r4
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov r5, s21
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adcs r3, r5
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: adds.w r12, r2, r5
; CHECK-NEXT: vmov.u8 r5, q1[12]
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov.u8 r4, q0[12]
; CHECK-NEXT: vmov.32 q3[0], r5
; CHECK-NEXT: vmov.u8 r5, q1[13]
; CHECK-NEXT: vmov.32 q4[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[13]
; CHECK-NEXT: vmov.32 q3[2], r5
; CHECK-NEXT: vmov.32 q4[2], r4
; CHECK-NEXT: vand q3, q3, q2
; CHECK-NEXT: vand q4, q4, q2
; CHECK-NEXT: vmov r5, s12
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[0], r5
; CHECK-NEXT: vmov r5, s14
; CHECK-NEXT: vmov.32 q5[1], r4
; CHECK-NEXT: vmov r4, s18
; CHECK-NEXT: umull r5, r4, r4, r5
; CHECK-NEXT: vmov.32 q5[2], r5
; CHECK-NEXT: vmov.32 q5[3], r4
; CHECK-NEXT: vmov r2, s20
; CHECK-NEXT: vmov r5, s21
; CHECK-NEXT: adds.w r2, r2, r12
; CHECK-NEXT: adcs r3, r5
; CHECK-NEXT: vmov r5, s22
; CHECK-NEXT: adds r2, r2, r5
; CHECK-NEXT: vmov.u8 r5, q1[14]
; CHECK-NEXT: vmov.32 q3[0], r5
; CHECK-NEXT: vmov.u8 r5, q1[15]
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: vmov.32 q3[2], r5
; CHECK-NEXT: vmov.u8 r4, q0[14]
; CHECK-NEXT: vand q1, q3, q2
; CHECK-NEXT: vmov.32 q3[0], r4
; CHECK-NEXT: vmov.u8 r4, q0[15]
; CHECK-NEXT: vmov.32 q3[2], r4
; CHECK-NEXT: vmov r5, s4
; CHECK-NEXT: vand q0, q3, q2
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: umlal r2, r3, r4, r5
; CHECK-NEXT: vmov r5, s6
; CHECK-NEXT: vmov r4, s2
; CHECK-NEXT: umlal r2, r3, r4, r5
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%yy = zext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vmov.u8 r2, q1[0]
; CHECK-NEXT: vmov.u8 r3, q0[0]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[1]
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov.u8 r3, q0[1]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vmov lr, s10
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: vmov r12, s9
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.u8 r2, q1[2]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov.u8 r3, q0[2]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[3]
; CHECK-NEXT: vmov.32 q2[1], r3
; CHECK-NEXT: vmov.u8 r3, q0[3]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r3
; CHECK-NEXT: vmov r4, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r4, r4, lr
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w lr, r4, r2
; CHECK-NEXT: vmov.u8 r4, q1[4]
; CHECK-NEXT: vmov.u8 r2, q0[4]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: smull r2, r4, r2, r4
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[5]
; CHECK-NEXT: vmov.32 q2[1], r4
; CHECK-NEXT: vmov.u8 r4, q0[5]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smull r2, r4, r4, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r4
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w lr, r3, r2
; CHECK-NEXT: vmov.u8 r2, q0[6]
; CHECK-NEXT: adc.w r12, r12, r4
; CHECK-NEXT: vmov.u8 r4, q1[6]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: smull r2, r4, r2, r4
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[7]
; CHECK-NEXT: vmov.32 q2[1], r4
; CHECK-NEXT: vmov.u8 r4, q0[7]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smull r2, r4, r4, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r4
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w lr, r3, r2
; CHECK-NEXT: vmov.u8 r2, q0[8]
; CHECK-NEXT: adc.w r12, r12, r4
; CHECK-NEXT: vmov.u8 r4, q1[8]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: smull r2, r4, r2, r4
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[9]
; CHECK-NEXT: vmov.32 q2[1], r4
; CHECK-NEXT: vmov.u8 r4, q0[9]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smull r2, r4, r4, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r4
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w lr, r3, r2
; CHECK-NEXT: vmov.u8 r2, q0[10]
; CHECK-NEXT: adc.w r12, r12, r4
; CHECK-NEXT: vmov.u8 r4, q1[10]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: smull r2, r4, r2, r4
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[11]
; CHECK-NEXT: vmov.32 q2[1], r4
; CHECK-NEXT: vmov.u8 r4, q0[11]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smull r2, r4, r4, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r4
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds.w lr, r3, r2
; CHECK-NEXT: vmov.u8 r2, q0[12]
; CHECK-NEXT: adc.w r12, r12, r4
; CHECK-NEXT: vmov.u8 r4, q1[12]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: smull r2, r4, r2, r4
; CHECK-NEXT: vmov.32 q2[0], r2
; CHECK-NEXT: vmov.u8 r2, q1[13]
; CHECK-NEXT: vmov.32 q2[1], r4
; CHECK-NEXT: vmov.u8 r4, q0[13]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smull r2, r4, r4, r2
; CHECK-NEXT: vmov.32 q2[2], r2
; CHECK-NEXT: vmov.32 q2[3], r4
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: vmov r2, s9
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, r12, r4
; CHECK-NEXT: vmov.u8 r4, q1[14]
; CHECK-NEXT: sxtb.w r12, r4
; CHECK-NEXT: vmov.u8 r4, q0[14]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smlal r2, r3, r4, r12
; CHECK-NEXT: vmov.u8 r4, q1[15]
; CHECK-NEXT: sxtb.w r12, r4
; CHECK-NEXT: vmov.u8 r4, q0[15]
; CHECK-NEXT: sxtb r4, r4
; CHECK-NEXT: smlal r2, r3, r4, r12
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r4, pc}
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%yy = sext <16 x i8> %y to <16 x i64>
%m = mul <16 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i64 q2, #0xff
; CHECK-NEXT: vand q1, q1, q2
; CHECK-NEXT: vand q0, q0, q2
; CHECK-NEXT: vmov r2, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: umull r12, lr, r3, r2
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: add r2, r12
; CHECK-NEXT: orr.w r3, r3, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%yy = zext <2 x i8> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smull r2, r12, r3, r2
; CHECK-NEXT: vmov r3, s6
; CHECK-NEXT: sxtb.w lr, r3
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: smlal r2, r12, r3, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r12
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%yy = sext <2 x i8> %y to <2 x i64>
%m = mul <2 x i64> %xx, %yy
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, i64 %a) {
; CHECK-LABEL: add_v2i64_v2i64_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, lr}
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: vmov r2, s4
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r4, s5
; CHECK-NEXT: vmov r6, s7
; CHECK-NEXT: umull r12, lr, r3, r2
; CHECK-NEXT: mla r3, r3, r4, lr
; CHECK-NEXT: vmov r4, s1
; CHECK-NEXT: vmov.32 q2[0], r12
; CHECK-NEXT: mla r2, r4, r2, r3
; CHECK-NEXT: vmov r4, s6
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: vmov.32 q2[1], r2
; CHECK-NEXT: vmov r12, s8
; CHECK-NEXT: umull lr, r5, r3, r4
; CHECK-NEXT: mla r3, r3, r6, r5
; CHECK-NEXT: vmov r5, s3
; CHECK-NEXT: adds.w r6, r12, lr
; CHECK-NEXT: mla r3, r5, r4, r3
; CHECK-NEXT: adcs r2, r3
; CHECK-NEXT: adds r0, r0, r6
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r4, r5, r6, pc}
entry:
%m = mul <2 x i64> %x, %y
%z = call i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64> %m)
%r = add i64 %z, %a
ret i64 %r
}
declare i16 @llvm.experimental.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.experimental.vector.reduce.add.v8i16(<8 x i16>)
declare i32 @llvm.experimental.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.experimental.vector.reduce.add.v8i32(<8 x i32>)
declare i64 @llvm.experimental.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.experimental.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.experimental.vector.reduce.add.v16i8(<16 x i8>)