1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 18:54:02 +01:00
llvm-mirror/test/CodeGen/Thumb2/mve-vecreduce-add.ll
David Green 48eaab481d [ARM] Extend more reductions during lowering
This relaxes the VMLAV and VADDV reduction recognition code to handle
smaller-than-legal types, extending them as needed. This was already
handled for some reductions; this change extends it to more types in a
more generic way. If a smaller-than-legal value is found, it is extended
to the legal type.

Differential Revision: https://reviews.llvm.org/D106051
2021-07-19 08:58:03 +01:00

1466 lines
45 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
; Plain add reduction over the four i32 lanes of %x, with no extension.
; Expected output is a single vaddv.u32.
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
ret i32 %z
}
; Zero-extends each i32 lane to i64 and add-reduces to an i64 result.
; Expected output is a single vaddlv.u32 producing the r0/r1 pair.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddlv.u32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Sign-extends each i32 lane to i64 and add-reduces to an i64 result.
; Expected output is a single vaddlv.s32 producing the r0/r1 pair.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddlv.s32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Zero-extends a <2 x i32> vector to <2 x i64> and add-reduces it.
; Expected output masks each 64-bit lane with 0xffffffff, moves both lanes
; to GPR pairs and sums them with adds/adcs (no vector reduction is used).
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q1, #0xffffffff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Sign-extends a <2 x i32> vector to <2 x i64> and add-reduces it.
; Expected output reads the two lanes from s0 and s2 and performs the 64-bit
; add in GPRs, forming the high words with `asr #31`.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Zero-extends eight i16 lanes to i32 and add-reduces to an i32 result.
; Expected output is a single vaddv.u16.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Sign-extends eight i16 lanes to i32 and add-reduces to an i32 result.
; Expected output is a single vaddv.s16.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.s16 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Zero-extends a <4 x i16> vector to <4 x i32> and add-reduces it.
; Expected output widens the lanes with vmovlb.u16, then uses vaddv.u32.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Sign-extends a <4 x i16> vector to <4 x i32> and add-reduces it.
; Expected output does the sign extension with vmovlb.s16; the reduction
; itself is then a plain vaddv.u32.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Plain add reduction over eight i16 lanes; the i16 result is returned
; zeroext. Expected output is vaddv.u16 followed by uxth on r0.
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
ret i16 %z
}
; Zero-extends eight i16 lanes to i64 and add-reduces to an i64 result.
; Expected output is scalarised: lanes are extracted two at a time with
; vmov.u16, packed into q2 (q0 for the last pair), masked with 0xffff via
; vand, and accumulated into r0/r1 with add/adds/adcs.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u16 r0, q0[1]
; CHECK-NEXT: vmov.u16 r1, q0[0]
; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov.u16 r3, q0[2]
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov.u16 r2, q0[3]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vmov.u16 r3, q0[4]
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov.u16 r2, q0[5]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Sign-extends eight i16 lanes to i64 and add-reduces to an i64 result.
; Expected output is scalarised: each lane is extracted with vmov.s16 and
; accumulated into r0/r1 with adds/adc.w, using `asr #31` of the lane for
; the high word.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.s16 r0, q0[0]
; CHECK-NEXT: vmov.s16 r2, q0[1]
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s16 r2, q0[2]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s16 r2, q0[3]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s16 r2, q0[4]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s16 r2, q0[5]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s16 r2, q0[6]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s16 r2, q0[7]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Zero-extends a <4 x i16> vector to <4 x i64> and add-reduces it.
; Expected output widens the lanes with vmovlb.u16, then uses vaddlv.u32.
define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vaddlv.u32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Sign-extends a <4 x i16> vector to <4 x i64> and add-reduces it.
; Expected output widens the lanes with vmovlb.s16, then uses vaddlv.s32.
define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vaddlv.s32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Zero-extends a <2 x i16> vector to <2 x i64> and add-reduces it.
; Expected output masks the lanes with 0xffff and adds them in GPRs; only a
; 32-bit add of the low words is emitted, with r1 taken from the high word
; of d1.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Sign-extends a <2 x i16> vector to <2 x i64> and add-reduces it.
; Expected output reads the lanes from s0 and s2, sign-extends each with
; sxth, and performs the 64-bit add in GPRs.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Zero-extends sixteen i8 lanes to i32 and add-reduces to an i32 result.
; Expected output is a single vaddv.u8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u8 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
ret i32 %z
}
; Sign-extends sixteen i8 lanes to i32 and add-reduces to an i32 result.
; Expected output is a single vaddv.s8.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.s8 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
ret i32 %z
}
; Zero-extends a <8 x i8> vector to <8 x i32> and add-reduces it.
; Expected output widens the lanes with vmovlb.u8, then uses vaddv.u16.
define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Sign-extends a <8 x i8> vector to <8 x i32> and add-reduces it.
; Expected output widens the lanes with vmovlb.s8, then uses vaddv.s16.
define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vaddv.s16 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
ret i32 %z
}
; Zero-extends a <4 x i8> vector to <4 x i32> and add-reduces it.
; Expected output masks each 32-bit lane with 0xff, then uses vaddv.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i32_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Sign-extends a <4 x i8> vector to <4 x i32> and add-reduces it.
; Expected output sign-extends in two steps (vmovlb.s8 then vmovlb.s16)
; before a plain vaddv.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i32_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
ret i32 %z
}
; Zero-extends sixteen i8 lanes to i16 and add-reduces; the i16 result is
; returned zeroext. Expected output is vaddv.u8 followed by uxth on r0.
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i16_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u8 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
ret i16 %z
}
; Sign-extends sixteen i8 lanes to i16 and add-reduces; the i16 result is
; returned signext. Expected output is vaddv.s8 followed by sxth on r0.
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i16_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.s8 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
ret i16 %z
}
; Zero-extends a <8 x i8> vector to <8 x i16> and add-reduces; the i16
; result is returned zeroext. Expected output is vmovlb.u8, vaddv.u16, uxth.
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i16_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
ret i16 %z
}
; Sign-extends a <8 x i8> vector to <8 x i16> and add-reduces; the i16
; result is returned signext. Expected output is vmovlb.s8, vaddv.u16, sxth.
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i16_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vaddv.u16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
ret i16 %z
}
; Plain add reduction over sixteen i8 lanes; the i8 result is returned
; zeroext. Expected output is vaddv.u8 followed by uxtb on r0.
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddv.u8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
ret i8 %z
}
; Zero-extends sixteen i8 lanes to i64 and add-reduces to an i64 result.
; Expected output is scalarised: lanes are extracted two at a time with
; vmov.u8, packed into q2 (q0 for the last pair), masked with 0xff via vand,
; and accumulated into r0/r1 with add/adds/adcs.
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u8 r0, q0[1]
; CHECK-NEXT: vmov.u8 r1, q0[0]
; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
; CHECK-NEXT: vmov.i64 q1, #0xff
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov.u8 r3, q0[2]
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov.u8 r2, q0[3]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vmov.u8 r3, q0[4]
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov.u8 r2, q0[5]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u8 r2, q0[7]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[6]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, r3, d4
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u8 r2, q0[9]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[8]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, r3, d4
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u8 r2, q0[11]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[10]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, r3, d4
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u8 r2, q0[13]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[12]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, r3, d4
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u8 r2, q0[15]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u8 r3, q0[14]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
ret i64 %z
}
; Sign-extends sixteen i8 lanes to i64 and add-reduces to an i64 result.
; Expected output is scalarised: each lane is extracted with vmov.s8 and
; accumulated into r0/r1 with adds/adc.w, using `asr #31` of the lane for
; the high word.
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.s8 r0, q0[0]
; CHECK-NEXT: vmov.s8 r2, q0[1]
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[2]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[3]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[4]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[5]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[6]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[7]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[8]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[9]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[10]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[11]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[12]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[13]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[14]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[15]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
ret i64 %z
}
; Zero-extends a <8 x i8> vector to <8 x i64> and add-reduces it.
; Expected output first widens with vmovlb.u8, then is scalarised: lanes
; are extracted two at a time with vmov.u16, masked with 0xffff via vand,
; and accumulated into r0/r1 with add/adds/adcs.
define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vmov.u16 r0, q0[1]
; CHECK-NEXT: vmov.u16 r1, q0[0]
; CHECK-NEXT: vmov q2[2], q2[0], r1, r0
; CHECK-NEXT: vmov.u16 r3, q0[2]
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r0, r1, d5
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov.u16 r2, q0[3]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vmov.u16 r3, q0[4]
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, s10
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov.u16 r2, q0[5]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Sign-extends a <8 x i8> vector to <8 x i64> and add-reduces it.
; Expected output is scalarised: each lane is read with vmov.u16,
; sign-extended from 8 bits with sxtb, and accumulated into r0/r1 with
; adds/adc.w.
define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u16 r0, q0[0]
; CHECK-NEXT: vmov.u16 r2, q0[1]
; CHECK-NEXT: sxtb r0, r0
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[2]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[3]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[5]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[6]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
ret i64 %z
}
; Zero-extends a <4 x i8> vector to <4 x i64> and add-reduces it.
; Expected output masks each 32-bit lane with 0xff, then uses vaddlv.u32.
define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vaddlv.u32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Sign-extends a <4 x i8> vector to <4 x i64> and add-reduces it.
; Expected output sign-extends in two steps (vmovlb.s8 then vmovlb.s16)
; before vaddlv.s32.
define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vaddlv.s32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
ret i64 %z
}
; Zero-extends a <2 x i8> vector to <2 x i64> and add-reduces it.
; Expected output masks the lanes with 0xff and adds them in GPRs; only a
; 32-bit add of the low words is emitted, with r1 taken from the high word
; of d1.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Sign-extends a <2 x i8> vector to <2 x i64> and add-reduces it.
; Expected output reads the lanes from s0 and s2, sign-extends each with
; sxtb, and performs the 64-bit add in GPRs.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: sxtb r0, r0
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
ret i64 %z
}
; Plain add reduction over two i64 lanes, with no extension.
; Expected output moves both lanes to GPR pairs and sums them with
; adds/adcs.
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, r1, d1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
ret i64 %z
}
; Add reduction over four i32 lanes, with the scalar %a added to the result.
; Expected output folds the add into a single accumulating vaddva.u32 on r0.
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends four i32 lanes to i64, add-reduces, and adds the i64 %a.
; Expected output is a single accumulating vaddlva.u32 on r0/r1.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddlva.u32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends four i32 lanes to i64, add-reduces, and adds the i64 %a.
; Expected output is a single accumulating vaddlva.s32 on r0/r1.
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddlva.s32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i32> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends a <2 x i32> vector to <2 x i64>, add-reduces, and adds the
; i64 %a. Expected output masks the lanes with 0xffffffff and does both
; 64-bit adds in GPRs; lr is saved (push {r7, lr}) and used as scratch.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.i64 q1, #0xffffffff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: vmov r3, r2, d0
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends a <2 x i32> vector to <2 x i64>, add-reduces, and adds the
; i64 %a. Expected output sums the two lanes in r2/r3 (high words formed
; with `asr #31`) and then adds that pair into r0/r1.
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends eight i16 lanes to i32, add-reduces, and adds the i32 %a.
; Expected output is a single accumulating vaddva.u16 on r0.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends eight i16 lanes to i32, add-reduces, and adds the i32 %a.
; Expected output is a single accumulating vaddva.s16 on r0.
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddva.s16 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i16> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends a <4 x i16> vector to <4 x i32>, add-reduces, and adds the
; i32 %a. Expected output is vmovlb.u16 followed by accumulating vaddva.u32.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends a <4 x i16> vector to <4 x i32>, add-reduces, and adds the
; i32 %a. Expected output is vmovlb.s16 followed by accumulating vaddva.u32.
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i16> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Add reduction over eight i16 lanes plus the i16 %a; the result is returned
; zeroext. Expected output is accumulating vaddva.u16 followed by uxth.
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
; CHECK-LABEL: add_v8i16_v8i16_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
%r = add i16 %z, %a
ret i16 %r
}
; Zero-extends eight i16 lanes to i64, add-reduces, and adds the i64 %a.
; Expected output is scalarised: lane pairs are extracted with vmov.u16,
; masked with 0xffff via vand and summed in lr/r12 (lr is saved with
; push {r7, lr}); the total is then added into r0/r1.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.u16 r2, q0[1]
; CHECK-NEXT: vmov.u16 r3, q0[0]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, r12, d5
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: add.w lr, r3, r2
; CHECK-NEXT: vmov.u16 r3, q0[3]
; CHECK-NEXT: vmov.u16 r2, q0[2]
; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: add r2, lr
; CHECK-NEXT: add.w lr, r2, r3
; CHECK-NEXT: vmov.u16 r3, q0[5]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: add lr, r2
; CHECK-NEXT: vmov r3, r2, d5
; CHECK-NEXT: adds.w lr, lr, r3
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: adds.w r2, r2, lr
; CHECK-NEXT: adc.w r3, r3, r12
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = zext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends eight i16 lanes to i64, add-reduces, and adds the i64 %a.
; Expected output is scalarised: each lane is extracted with vmov.s16 and
; summed with adds.w/adc.w (high words via `asr #31`), using r12 and a saved
; lr as scratch; the total is then added into r0/r1.
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.s16 r2, q0[0]
; CHECK-NEXT: vmov.s16 r3, q0[1]
; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: adds.w lr, r2, r3
; CHECK-NEXT: vmov.s16 r2, q0[2]
; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: adds.w r12, lr, r2
; CHECK-NEXT: adc.w r2, r3, r2, asr #31
; CHECK-NEXT: vmov.s16 r3, q0[3]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s16 r3, q0[4]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s16 r3, q0[5]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s16 r3, q0[6]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w lr, r2, r3, asr #31
; CHECK-NEXT: vmov.s16 r3, q0[7]
; CHECK-NEXT: adds.w r2, r12, r3
; CHECK-NEXT: adc.w r3, lr, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends a <4 x i16> vector to <4 x i64>, add-reduces, and adds the
; i64 %a. Expected output is vmovlb.u16 followed by accumulating
; vaddlva.u32 on r0/r1.
define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u16 q0, q0
; CHECK-NEXT: vaddlva.u32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends a <4 x i16> vector to <4 x i64>, add-reduces, and adds the
; i64 %a. Expected output is vmovlb.s16 followed by accumulating
; vaddlva.s32 on r0/r1.
define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vaddlva.s32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i16> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends a <2 x i16> vector to <2 x i64>, add-reduces, and adds the
; i64 %a. Expected output masks the lanes with 0xffff, adds the low words
; with a 32-bit add, and then adds the result into r0/r1 with adds/adc.w.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, r12, d1
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r12
; CHECK-NEXT: bx lr
entry:
%xx = zext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Sign-extends a <2 x i16> vector to <2 x i64>, add-reduces, and adds the
; i64 %a. Expected output sign-extends the two lanes with sxth, sums them in
; r2/r3, and then adds that pair into r0/r1.
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: sxth r3, r3
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Zero-extends sixteen i8 lanes to i32, add-reduces, and adds the i32 %a.
; Expected output is a single accumulating vaddva.u8 on r0.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends sixteen i8 lanes to i32, add-reduces, and adds the i32 %a.
; Expected output is a single accumulating vaddva.s8 on r0.
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddva.s8 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i32>
%z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends a <8 x i8> vector to <8 x i32>, add-reduces, and adds the
; i32 %a. Expected output is vmovlb.u8 followed by accumulating vaddva.u16.
define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends a <8 x i8> vector to <8 x i32>, add-reduces, and adds the
; i32 %a. Expected output is vmovlb.s8 followed by accumulating vaddva.s16.
define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vaddva.s16 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <8 x i8> %x to <8 x i32>
%z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends a <4 x i8> vector to <4 x i32>, add-reduces, and adds the
; i32 %a. Expected output masks each 32-bit lane with 0xff, then uses
; accumulating vaddva.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = zext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Sign-extends a <4 x i8> vector to <4 x i32>, add-reduces, and adds the
; i32 %a. Expected output sign-extends in two steps (vmovlb.s8 then
; vmovlb.s16) before accumulating vaddva.u32.
define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vaddva.u32 r0, q0
; CHECK-NEXT: bx lr
entry:
%xx = sext <4 x i8> %x to <4 x i32>
%z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
%r = add i32 %z, %a
ret i32 %r
}
; Zero-extends sixteen i8 lanes to i16, add-reduces, and adds the i16 %a;
; the result is returned zeroext. Expected output is accumulating vaddva.u8
; followed by uxth.
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = zext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Sign-extends sixteen i8 lanes to i16, add-reduces, and adds the i16 %a;
; the result is returned signext. Expected output is accumulating vaddva.s8
; followed by sxth.
define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddva.s8 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = sext <16 x i8> %x to <16 x i16>
%z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Zero-extends a <8 x i8> vector to <8 x i16>, add-reduces, and adds the
; i16 %a; the result is returned zeroext. Expected output is vmovlb.u8,
; accumulating vaddva.u16, then uxth.
define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: uxth r0, r0
; CHECK-NEXT: bx lr
entry:
%xx = zext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Accumulating add reduction: <8 x i8> sign-extended to <8 x i16>, reduced to
; an i16 and then added to the i16 scalar %a.
; Expected assembly: one widening step (vmovlb.s8), a single vaddva.u16 into
; r0, and a trailing sxth for the signext i16 return value.
define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vaddva.u16 r0, q0
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: bx lr
entry:
; Widen, reduce, then accumulate into %a.
%xx = sext <8 x i8> %x to <8 x i16>
%z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
%r = add i16 %z, %a
ret i16 %r
}
; Accumulating add reduction with no extension: <16 x i8> reduced to an i8 and
; then added to the i8 scalar %a.
; Expected assembly: a single vaddva.u8 into r0, followed by uxtb for the
; zeroext i8 return value.
define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
; CHECK-LABEL: add_v16i8_v16i8_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vaddva.u8 r0, q0
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: bx lr
entry:
; Reduce directly at the element width, then accumulate into %a.
%z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
%r = add i8 %z, %a
ret i8 %r
}
; Accumulating add reduction: <16 x i8> zero-extended to <16 x i64>, reduced
; to an i64 and then added to the i64 scalar %a.
; Expected assembly: no vector reduction instruction is used. Lanes are moved
; out two at a time with vmov.u8, rebuilt into a q register, masked with
; #0xff (vmov.i64 + vand), and summed in lr/r12 with scalar add / adds / adc.
; The running total is finally added to %a in r0:r1 (adds / adcs). lr is
; saved and restored with push/pop {r7, lr} because it is used as scratch.
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.u8 r2, q0[1]
; CHECK-NEXT: vmov.u8 r3, q0[0]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vmov.i64 q1, #0xff
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, r12, d5
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: add.w lr, r3, r2
; CHECK-NEXT: vmov.u8 r3, q0[3]
; CHECK-NEXT: vmov.u8 r2, q0[2]
; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: add r2, lr
; CHECK-NEXT: add.w lr, r2, r3
; CHECK-NEXT: vmov.u8 r3, q0[5]
; CHECK-NEXT: vmov.u8 r2, q0[4]
; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: add lr, r2
; CHECK-NEXT: vmov r3, r2, d5
; CHECK-NEXT: adds.w lr, lr, r3
; CHECK-NEXT: vmov.u8 r3, q0[6]
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov.u8 r2, q0[7]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, r3, d4
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.u8 r2, q0[9]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov.u8 r3, q0[8]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, r3, d4
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.u8 r2, q0[11]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov.u8 r3, q0[10]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, r3, d4
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.u8 r2, q0[13]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov.u8 r3, q0[12]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, r3, d4
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.u8 r2, q0[15]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov.u8 r3, q0[14]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: adds.w r2, r2, lr
; CHECK-NEXT: adc.w r3, r3, r12
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
; Widen to 64-bit lanes, reduce, then accumulate into %a.
%xx = zext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating add reduction: <16 x i8> sign-extended to <16 x i64>, reduced
; to an i64 and then added to the i64 scalar %a.
; Expected assembly: no vector reduction instruction is used. Each lane is
; moved to a core register with vmov.s8 and added into a 64-bit running sum
; with adds / adc, where the high word of each lane is supplied by an
; `asr #31` of the low word. The total is finally added to %a in r0:r1.
; lr is saved and restored with push/pop {r7, lr} because it is used as scratch.
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.s8 r2, q0[0]
; CHECK-NEXT: vmov.s8 r3, q0[1]
; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: adds.w lr, r2, r3
; CHECK-NEXT: vmov.s8 r2, q0[2]
; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: adds.w r12, lr, r2
; CHECK-NEXT: adc.w r2, r3, r2, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[3]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[4]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[5]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[6]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[7]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[8]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[9]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[10]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[11]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[12]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[13]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[14]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w lr, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[15]
; CHECK-NEXT: adds.w r2, r12, r3
; CHECK-NEXT: adc.w r3, lr, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
; Widen to 64-bit lanes, reduce, then accumulate into %a.
%xx = sext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating add reduction: <8 x i8> zero-extended to <8 x i64>, reduced to
; an i64 and then added to the i64 scalar %a.
; Expected assembly: the input is first widened with vmovlb.u8, but no vector
; reduction instruction is used. Lanes are moved out two at a time with
; vmov.u16, rebuilt into a q register, masked with #0xffff (vmov.i64 + vand),
; and summed in lr/r12 with scalar add / adds / adc before the final add of
; %a in r0:r1. lr is saved and restored with push/pop {r7, lr}.
define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmovlb.u8 q0, q0
; CHECK-NEXT: vmov.i64 q1, #0xffff
; CHECK-NEXT: vmov.u16 r2, q0[1]
; CHECK-NEXT: vmov.u16 r3, q0[0]
; CHECK-NEXT: vmov q2[2], q2[0], r3, r2
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, r12, d5
; CHECK-NEXT: vmov r3, s8
; CHECK-NEXT: add.w lr, r3, r2
; CHECK-NEXT: vmov.u16 r3, q0[3]
; CHECK-NEXT: vmov.u16 r2, q0[2]
; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: vmov r3, s10
; CHECK-NEXT: add r2, lr
; CHECK-NEXT: add.w lr, r2, r3
; CHECK-NEXT: vmov.u16 r3, q0[5]
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: vmov q2[2], q2[0], r2, r3
; CHECK-NEXT: vand q2, q2, q1
; CHECK-NEXT: vmov r2, s8
; CHECK-NEXT: add lr, r2
; CHECK-NEXT: vmov r3, r2, d5
; CHECK-NEXT: adds.w lr, lr, r3
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: adds.w r2, r2, lr
; CHECK-NEXT: adc.w r3, r3, r12
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
; Widen to 64-bit lanes, reduce, then accumulate into %a.
%xx = zext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating add reduction: <8 x i8> sign-extended to <8 x i64>, reduced to
; an i64 and then added to the i64 scalar %a.
; Expected assembly: no vector reduction instruction is used. Each lane is
; moved to a core register with vmov.u16, sign-extended from 8 bits with sxtb,
; and added into a 64-bit running sum with adds / adc (high word supplied by
; `asr #31`). The total is finally added to %a in r0:r1.
; lr is saved and restored with push/pop {r7, lr} because it is used as scratch.
define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.u16 r2, q0[0]
; CHECK-NEXT: vmov.u16 r3, q0[1]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: adds.w lr, r2, r3
; CHECK-NEXT: vmov.u16 r2, q0[2]
; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adds.w r12, lr, r2
; CHECK-NEXT: adc.w r2, r3, r2, asr #31
; CHECK-NEXT: vmov.u16 r3, q0[3]
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.u16 r3, q0[4]
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.u16 r3, q0[5]
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w lr, r2, r3, asr #31
; CHECK-NEXT: vmov.u16 r3, q0[7]
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: adds.w r2, r12, r3
; CHECK-NEXT: adc.w r3, lr, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
; Widen to 64-bit lanes, reduce, then accumulate into %a.
%xx = sext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating add reduction: <4 x i8> zero-extended to <4 x i64>, reduced to
; an i64 and then added to the i64 scalar %a.
; Expected assembly: the zero-extension appears as a mask with #0xff
; (vmov.i32 + vand), followed by a single long accumulating reduction
; vaddlva.u32 into the r0:r1 pair.
define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i32 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vaddlva.u32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
; Widen to 64-bit lanes, reduce, then accumulate into %a.
%xx = zext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating add reduction: <4 x i8> sign-extended to <4 x i64>, reduced to
; an i64 and then added to the i64 scalar %a.
; Expected assembly: two widening steps (vmovlb.s8, then vmovlb.s16) followed
; by a single long accumulating reduction vaddlva.s32 into the r0:r1 pair.
define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmovlb.s8 q0, q0
; CHECK-NEXT: vmovlb.s16 q0, q0
; CHECK-NEXT: vaddlva.s32 r0, r1, q0
; CHECK-NEXT: bx lr
entry:
; Widen to 64-bit lanes, reduce, then accumulate into %a.
%xx = sext <4 x i8> %x to <4 x i64>
%z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating add reduction: <2 x i8> zero-extended to <2 x i64>, reduced to
; an i64 and then added to the i64 scalar %a.
; Expected assembly: no vector reduction instruction is used. Both lanes are
; masked with #0xff (vmov.i64 + vand), moved to core registers, summed with a
; plain add, and then added to %a in r0:r1 with adds / adc.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.i64 q1, #0xff
; CHECK-NEXT: vand q0, q0, q1
; CHECK-NEXT: vmov r2, r12, d1
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: add r2, r3
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r12
; CHECK-NEXT: bx lr
entry:
; Widen to 64-bit lanes, reduce, then accumulate into %a.
%xx = zext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating add reduction: <2 x i8> sign-extended to <2 x i64>, reduced to
; an i64 and then added to the i64 scalar %a.
; Expected assembly: no vector reduction instruction is used. The two lanes
; are read from s0 and s2, sign-extended from 8 bits with sxtb, summed as
; 64-bit values with adds / adc (high words supplied by `asr #31`), and then
; added to %a in r0:r1.
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
; Widen to 64-bit lanes, reduce, then accumulate into %a.
%xx = sext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
%r = add i64 %z, %a
ret i64 %r
}
; Accumulating add reduction with no extension: <2 x i64> reduced to an i64
; and then added to the i64 scalar %a.
; Expected assembly: no vector reduction instruction is used. Both 64-bit
; lanes are moved to core register pairs, summed with adds / adc, and then
; added to %a in r0:r1. lr is saved and restored with push/pop {r7, lr}
; because it is used as scratch.
define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) {
; CHECK-LABEL: add_v2i64_v2i64_acc:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov lr, r12, d1
; CHECK-NEXT: vmov r3, r2, d0
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r2, r12
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r7, pc}
entry:
; Reduce directly at the element width, then accumulate into %a.
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
%r = add i64 %z, %a
ret i64 %r
}
; Declarations of the llvm.vector.reduce.add intrinsic overloads. Each takes a
; single vector operand and returns a scalar of the vector's element type.
; Grouped by result type: i16, i32, i64, then i8.
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)