1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[ARM] Expand VMOVRRD simplification pattern

This expands the VMOVRRD(extract(..(build_vector(a, b, c, d)))) pattern
to also handle insert_vector_elt chains. Provided we can find the correct
inserts, this helps further simplify patterns by removing the redundant VMOVRRD.

Differential Revision: https://reviews.llvm.org/D100245
This commit is contained in:
David Green 2021-04-26 12:27:38 +01:00
parent bd921d06d1
commit 746a7315fd
6 changed files with 577 additions and 1026 deletions

View File

@ -13695,22 +13695,51 @@ static SDValue PerformVMOVRRDCombine(SDNode *N,
}
// VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
// VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(InDouble.getOperand(1))) {
SDValue BV = InDouble.getOperand(0);
// Look up through any nop bitcasts
while (BV.getOpcode() == ISD::BITCAST &&
(BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64))
// Look up through any nop bitcasts and vector_reg_casts. bitcasts may
// change lane order under big endian.
bool BVSwap = BV.getOpcode() == ISD::BITCAST;
while (
(BV.getOpcode() == ISD::BITCAST ||
BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
(BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
BVSwap = BV.getOpcode() == ISD::BITCAST;
BV = BV.getOperand(0);
if (BV.getValueType() != MVT::v4i32 || BV.getOpcode() != ISD::BUILD_VECTOR)
}
if (BV.getValueType() != MVT::v4i32)
return SDValue();
// Handle buildvectors, pulling out the correct lane depending on
// endianness.
unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
if (Subtarget->isLittle())
return DCI.DAG.getMergeValues(
{BV.getOperand(Offset), BV.getOperand(Offset + 1)}, SDLoc(N));
else
return DCI.DAG.getMergeValues(
{BV.getOperand(Offset + 1), BV.getOperand(Offset)}, SDLoc(N));
if (BV.getOpcode() == ISD::BUILD_VECTOR) {
SDValue Op0 = BV.getOperand(Offset);
SDValue Op1 = BV.getOperand(Offset + 1);
if (!Subtarget->isLittle() && BVSwap)
std::swap(Op0, Op1);
return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
}
// A chain of insert_vectors, grabbing the correct value of the chain of
// inserts.
SDValue Op0, Op1;
while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
if (isa<ConstantSDNode>(BV.getOperand(2))) {
if (BV.getConstantOperandVal(2) == Offset)
Op0 = BV.getOperand(1);
if (BV.getConstantOperandVal(2) == Offset + 1)
Op1 = BV.getOperand(1);
}
BV = BV.getOperand(0);
}
if (!Subtarget->isLittle() && BVSwap)
std::swap(Op0, Op1);
if (Op0 && Op1)
return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
}
return SDValue();

View File

@ -44,9 +44,14 @@ define <4 x i32> @h(<4 x i8> *%in) {
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.32 {d16[0]}, [r0:32]
; CHECK-NEXT: vmovl.u8 q8, d16
; CHECK-NEXT: vmovl.u16 q8, d16
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: vmov.u16 r0, d16[0]
; CHECK-NEXT: vmov.u16 r1, d16[1]
; CHECK-NEXT: vmov.u16 r2, d16[2]
; CHECK-NEXT: vmov.u16 r3, d16[3]
; CHECK-NEXT: uxtb r0, r0
; CHECK-NEXT: uxtb r1, r1
; CHECK-NEXT: uxtb r2, r2
; CHECK-NEXT: uxtb r3, r3
; CHECK-NEXT: bx lr
%1 = load <4 x i8>, <4 x i8>* %in, align 4
%2 = extractelement <4 x i8> %1, i32 0

View File

@ -100,9 +100,9 @@ define <8 x i16> @v_dupQ16(i16 %A) nounwind {
define <4 x i32> @v_dupQ32(i32 %A) nounwind {
; CHECK-LABEL: v_dupQ32:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.32 q8, r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: mov pc, lr
%tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0
%tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1
@ -379,10 +379,9 @@ define void @redundantVdup(<8 x i8>* %ptr) nounwind {
define <4 x i32> @tdupi(i32 %x, i32 %y) {
; CHECK-LABEL: tdupi:
; CHECK: @ %bb.0:
; CHECK-NEXT: vdup.32 q8, r0
; CHECK-NEXT: vmov.32 d17[1], r1
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov r3, r1
; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: mov pc, lr
%1 = insertelement <4 x i32> undef, i32 %x, i32 0
%2 = insertelement <4 x i32> %1, i32 %x, i32 1
@ -412,11 +411,10 @@ define <4 x i32> @tduplane(<4 x i32> %invec) {
; CHECK-LABEL: tduplane:
; CHECK: @ %bb.0:
; CHECK-NEXT: vmov d16, r0, r1
; CHECK-NEXT: mov r0, #255
; CHECK-NEXT: vdup.32 q8, d16[1]
; CHECK-NEXT: vmov.32 d17[1], r0
; CHECK-NEXT: vmov r0, r1, d16
; CHECK-NEXT: vmov r2, r3, d17
; CHECK-NEXT: mov r3, #255
; CHECK-NEXT: vmov.32 r0, d16[1]
; CHECK-NEXT: mov r1, r0
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: mov pc, lr
%in = extractelement <4 x i32> %invec, i32 1
%1 = insertelement <4 x i32> undef, i32 %in, i32 0

View File

@ -524,19 +524,16 @@ entry:
define <4 x i32> @insertextract(i32 %x, i32 %y) {
; CHECK-LE-LABEL: insertextract:
; CHECK-LE: @ %bb.0:
; CHECK-LE-NEXT: vdup.32 q0, r0
; CHECK-LE-NEXT: vmov.32 q0[3], r1
; CHECK-LE-NEXT: vmov r0, r1, d0
; CHECK-LE-NEXT: vmov r2, r3, d1
; CHECK-LE-NEXT: mov r3, r1
; CHECK-LE-NEXT: mov r1, r0
; CHECK-LE-NEXT: mov r2, r0
; CHECK-LE-NEXT: bx lr
;
; CHECK-BE-LABEL: insertextract:
; CHECK-BE: @ %bb.0:
; CHECK-BE-NEXT: vdup.32 q0, r0
; CHECK-BE-NEXT: vmov.32 q0[3], r1
; CHECK-BE-NEXT: vrev64.32 q1, q0
; CHECK-BE-NEXT: vmov r1, r0, d2
; CHECK-BE-NEXT: vmov r3, r2, d3
; CHECK-BE-NEXT: mov r3, r1
; CHECK-BE-NEXT: mov r1, r0
; CHECK-BE-NEXT: mov r2, r0
; CHECK-BE-NEXT: bx lr
%1 = insertelement <4 x i32> undef, i32 %x, i32 0
%2 = insertelement <4 x i32> %1, i32 %x, i32 1

View File

@ -52,16 +52,11 @@ entry:
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r1, s2
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: vmov q0[2], q0[0], r0, r1
; CHECK-NEXT: asrs r2, r1, #31
; CHECK-NEXT: asrs r0, r0, #31
; CHECK-NEXT: vmov q0[3], q0[1], r0, r2
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r3, r1, asr #31
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i32> %x to <2 x i64>
@ -174,51 +169,28 @@ entry:
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.s16 r0, q0[1]
; CHECK-NEXT: vmov.s16 r1, q0[0]
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
; CHECK-NEXT: asrs r0, r0, #31
; CHECK-NEXT: asrs r1, r1, #31
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: vmov.s16 r0, q0[0]
; CHECK-NEXT: vmov.s16 r2, q0[1]
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s16 r2, q0[2]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s16 r2, q0[3]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.s16 r3, q0[2]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s16 r2, q0[4]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s16 r2, q0[5]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.s16 r3, q0[4]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s16 r2, q0[6]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s16 r2, q0[7]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.s16 r3, q0[6]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: asrs r0, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q0[3], q0[1], r3, r0
; CHECK-NEXT: vmov r0, r3, d0
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
@ -269,18 +241,13 @@ entry:
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: sxth r1, r0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: asrs r2, r1, #31
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: sxth r0, r0
; CHECK-NEXT: vmov q0[2], q0[0], r0, r1
; CHECK-NEXT: asrs r0, r0, #31
; CHECK-NEXT: vmov q0[3], q0[1], r0, r2
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r3, r1, asr #31
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i16> %x to <2 x i64>
@ -525,99 +492,52 @@ entry:
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.s8 r0, q0[1]
; CHECK-NEXT: vmov.s8 r1, q0[0]
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
; CHECK-NEXT: asrs r0, r0, #31
; CHECK-NEXT: asrs r1, r1, #31
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: vmov.s8 r0, q0[0]
; CHECK-NEXT: vmov.s8 r2, q0[1]
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[2]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[3]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.s8 r3, q0[2]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[4]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[5]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.s8 r3, q0[4]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[6]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[7]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.s8 r3, q0[6]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[8]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[9]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.s8 r3, q0[8]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[10]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[11]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.s8 r3, q0[10]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[12]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[13]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.s8 r3, q0[12]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[14]
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.s8 r2, q0[15]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.s8 r3, q0[14]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: asrs r0, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q0[3], q0[1], r3, r0
; CHECK-NEXT: vmov r0, r3, d0
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
@ -675,59 +595,36 @@ entry:
define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov.u16 r0, q0[1]
; CHECK-NEXT: vmov.u16 r1, q0[0]
; CHECK-NEXT: vmov.u16 r0, q0[0]
; CHECK-NEXT: vmov.u16 r2, q0[1]
; CHECK-NEXT: sxtb r0, r0
; CHECK-NEXT: sxtb r1, r1
; CHECK-NEXT: vmov q1[2], q1[0], r1, r0
; CHECK-NEXT: asrs r0, r0, #31
; CHECK-NEXT: asrs r1, r1, #31
; CHECK-NEXT: vmov q1[3], q1[1], r1, r0
; CHECK-NEXT: vmov r0, r1, d3
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[2]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[3]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u16 r3, q0[2]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[4]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[5]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u16 r3, q0[4]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adds.w r12, r0, r2
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[6]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: asrs r0, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q0[3], q0[1], r3, r0
; CHECK-NEXT: vmov r0, r3, d0
; CHECK-NEXT: adds.w r0, r0, r12
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
@ -780,18 +677,13 @@ entry:
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: sxtb r1, r0
; CHECK-NEXT: vmov r0, s0
; CHECK-NEXT: asrs r2, r1, #31
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: sxtb r0, r0
; CHECK-NEXT: vmov q0[2], q0[0], r0, r1
; CHECK-NEXT: asrs r0, r0, #31
; CHECK-NEXT: vmov q0[3], q0[1], r0, r2
; CHECK-NEXT: vmov r0, s2
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: asrs r1, r0, #31
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adc.w r1, r3, r1, asr #31
; CHECK-NEXT: adc.w r1, r1, r2, asr #31
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i8> %x to <2 x i64>
@ -871,21 +763,14 @@ entry:
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
; CHECK-NEXT: vmov r12, s2
; CHECK-NEXT: vmov r3, lr, d0
; CHECK-NEXT: adds.w r3, r3, r12
; CHECK-NEXT: adc.w r2, lr, r2, asr #31
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i32> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@ -1008,57 +893,34 @@ entry:
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vmov.s16 r2, q0[1]
; CHECK-NEXT: vmov.s16 r3, q0[0]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov lr, r12, d3
; CHECK-NEXT: vmov r3, r2, d2
; CHECK-NEXT: adds.w lr, lr, r3
; CHECK-NEXT: vmov.s16 r3, q0[2]
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov.s16 r2, q0[3]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.s16 r2, q0[5]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.s16 r2, q0[0]
; CHECK-NEXT: vmov.s16 r3, q0[1]
; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: adds.w lr, r2, r3
; CHECK-NEXT: vmov.s16 r2, q0[2]
; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: adds.w r12, lr, r2
; CHECK-NEXT: adc.w r2, r3, r2, asr #31
; CHECK-NEXT: vmov.s16 r3, q0[3]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s16 r3, q0[4]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.s16 r2, q0[7]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s16 r3, q0[5]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s16 r3, q0[6]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: asrs r4, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q0[3], q0[1], r3, r4
; CHECK-NEXT: vmov r3, r4, d0
; CHECK-NEXT: adds.w lr, lr, r3
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: adc.w r4, r4, r12
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r4, r2, asr #31
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w lr, r2, r3, asr #31
; CHECK-NEXT: vmov.s16 r3, q0[7]
; CHECK-NEXT: adds.w r2, r12, r3
; CHECK-NEXT: adc.w r3, lr, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <8 x i16> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@ -1113,23 +975,16 @@ entry:
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxth r2, r2
; CHECK-NEXT: sxth r3, r3
; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
; CHECK-NEXT: vmov r12, s2
; CHECK-NEXT: vmov r3, lr, d0
; CHECK-NEXT: adds.w r3, r3, r12
; CHECK-NEXT: adc.w r2, lr, r2, asr #31
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: sxth r3, r3
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i16> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
@ -1390,105 +1245,58 @@ entry:
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vmov.s8 r2, q0[1]
; CHECK-NEXT: vmov.s8 r3, q0[0]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov lr, r12, d3
; CHECK-NEXT: vmov r3, r2, d2
; CHECK-NEXT: adds.w lr, lr, r3
; CHECK-NEXT: vmov.s8 r3, q0[2]
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov.s8 r2, q0[3]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.s8 r2, q0[5]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.s8 r2, q0[0]
; CHECK-NEXT: vmov.s8 r3, q0[1]
; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: adds.w lr, r2, r3
; CHECK-NEXT: vmov.s8 r2, q0[2]
; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: adds.w r12, lr, r2
; CHECK-NEXT: adc.w r2, r3, r2, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[3]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[4]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.s8 r2, q0[7]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[5]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[6]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.s8 r2, q0[9]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[7]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[8]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.s8 r2, q0[11]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[9]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[10]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.s8 r2, q0[13]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[11]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[12]
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.s8 r2, q0[15]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[13]
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[14]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: asrs r4, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q0[3], q0[1], r3, r4
; CHECK-NEXT: vmov r3, r4, d0
; CHECK-NEXT: adds.w lr, lr, r3
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: adc.w r4, r4, r12
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r4, r2, asr #31
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w lr, r2, r3, asr #31
; CHECK-NEXT: vmov.s8 r3, q0[15]
; CHECK-NEXT: adds.w r2, r12, r3
; CHECK-NEXT: adc.w r3, lr, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <16 x i8> %x to <16 x i64>
%z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
@ -1550,65 +1358,42 @@ entry:
define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: vmov.u16 r2, q0[1]
; CHECK-NEXT: vmov.u16 r3, q0[0]
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov.u16 r2, q0[0]
; CHECK-NEXT: vmov.u16 r3, q0[1]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov lr, r12, d3
; CHECK-NEXT: vmov r3, r2, d2
; CHECK-NEXT: adds.w lr, lr, r3
; CHECK-NEXT: vmov.u16 r3, q0[2]
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov.u16 r2, q0[3]
; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: adds.w lr, r2, r3
; CHECK-NEXT: vmov.u16 r2, q0[2]
; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: adds.w r12, lr, r2
; CHECK-NEXT: adc.w r2, r3, r2, asr #31
; CHECK-NEXT: vmov.u16 r3, q0[3]
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.u16 r2, q0[5]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.u16 r3, q0[4]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: vmov q1[2], q1[0], r3, r2
; CHECK-NEXT: asrs r2, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q1[3], q1[1], r3, r2
; CHECK-NEXT: vmov r2, r3, d2
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: vmov r2, r3, d3
; CHECK-NEXT: adds.w lr, lr, r2
; CHECK-NEXT: vmov.u16 r2, q0[7]
; CHECK-NEXT: adc.w r12, r12, r3
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.u16 r3, q0[5]
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w r2, r2, r3, asr #31
; CHECK-NEXT: vmov.u16 r3, q0[6]
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: asrs r4, r2, #31
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q0[3], q0[1], r3, r4
; CHECK-NEXT: vmov r3, r4, d0
; CHECK-NEXT: adds.w lr, lr, r3
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: adc.w r4, r4, r12
; CHECK-NEXT: adds.w r3, r3, lr
; CHECK-NEXT: adc.w r2, r4, r2, asr #31
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r4, pc}
; CHECK-NEXT: adds.w r12, r12, r3
; CHECK-NEXT: adc.w lr, r2, r3, asr #31
; CHECK-NEXT: vmov.u16 r3, q0[7]
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: adds.w r2, r12, r3
; CHECK-NEXT: adc.w r3, lr, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: pop {r7, pc}
entry:
%xx = sext <8 x i8> %x to <8 x i64>
%z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
@ -1665,23 +1450,16 @@ entry:
define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r7, lr}
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: vmov r2, s0
; CHECK-NEXT: vmov r3, s2
; CHECK-NEXT: sxtb r2, r2
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: asr.w r12, r2, #31
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: asrs r3, r3, #31
; CHECK-NEXT: vmov q0[3], q0[1], r3, r12
; CHECK-NEXT: vmov r12, s2
; CHECK-NEXT: vmov r3, lr, d0
; CHECK-NEXT: adds.w r3, r3, r12
; CHECK-NEXT: adc.w r2, lr, r2, asr #31
; CHECK-NEXT: adds r0, r0, r3
; CHECK-NEXT: adcs r1, r2
; CHECK-NEXT: pop {r7, pc}
; CHECK-NEXT: sxtb r3, r3
; CHECK-NEXT: adds r2, r2, r3
; CHECK-NEXT: adc.w r3, r12, r3, asr #31
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
; CHECK-NEXT: bx lr
entry:
%xx = sext <2 x i8> %x to <2 x i64>
%z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)

File diff suppressed because it is too large Load Diff