diff --git a/lib/Target/ARM/ARMInstrMVE.td b/lib/Target/ARM/ARMInstrMVE.td
index aeeed7505b7..4548cb134b3 100644
--- a/lib/Target/ARM/ARMInstrMVE.td
+++ b/lib/Target/ARM/ARMInstrMVE.td
@@ -1561,6 +1561,60 @@ def MVE_VHSUBu8 : MVE_VHSUB<"u8", 0b1, 0b00>;
 def MVE_VHSUBu16 : MVE_VHSUB<"u16", 0b1, 0b01>;
 def MVE_VHSUBu32 : MVE_VHSUB<"u32", 0b1, 0b10>;
 
+let Predicates = [HasMVEInt] in {
+  def : Pat<(v16i8 (ARMvshrsImm
+                    (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
+            (v16i8 (MVE_VHADDs8
+                    (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
+  def : Pat<(v8i16 (ARMvshrsImm
+                    (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
+            (v8i16 (MVE_VHADDs16
+                    (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
+  def : Pat<(v4i32 (ARMvshrsImm
+                    (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
+            (v4i32 (MVE_VHADDs32
+                    (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
+
+  def : Pat<(v16i8 (ARMvshruImm
+                    (v16i8 (add (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
+            (v16i8 (MVE_VHADDu8
+                    (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
+  def : Pat<(v8i16 (ARMvshruImm
+                    (v8i16 (add (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
+            (v8i16 (MVE_VHADDu16
+                    (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
+  def : Pat<(v4i32 (ARMvshruImm
+                    (v4i32 (add (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
+            (v4i32 (MVE_VHADDu32
+                    (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
+
+  def : Pat<(v16i8 (ARMvshrsImm
+                    (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
+            (v16i8 (MVE_VHSUBs8
+                    (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
+  def : Pat<(v8i16 (ARMvshrsImm
+                    (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
+            (v8i16 (MVE_VHSUBs16
+                    (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
+  def : Pat<(v4i32 (ARMvshrsImm
+                    (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
+            (v4i32 (MVE_VHSUBs32
+                    (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
+
+  def : Pat<(v16i8 (ARMvshruImm
+                    (v16i8 (sub (v16i8 MQPR:$v1), (v16i8 MQPR:$v2))), 1)),
+            (v16i8 (MVE_VHSUBu8
+                    (v16i8 MQPR:$v1), (v16i8 MQPR:$v2)))>;
+  def : Pat<(v8i16 (ARMvshruImm
+                    (v8i16 (sub (v8i16 MQPR:$v1), (v8i16 MQPR:$v2))), 1)),
+            (v8i16 (MVE_VHSUBu16
+                    (v8i16 MQPR:$v1), (v8i16 MQPR:$v2)))>;
+  def : Pat<(v4i32 (ARMvshruImm
+                    (v4i32 (sub (v4i32 MQPR:$v1), (v4i32 MQPR:$v2))), 1)),
+            (v4i32 (MVE_VHSUBu32
+                    (v4i32 MQPR:$v1), (v4i32 MQPR:$v2)))>;
+}
+
 class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
   : MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary,
           "vdup", suffix, "$Qd, $Rt", vpred_r, "", pattern> {
diff --git a/test/CodeGen/Thumb2/mve-vhaddsub.ll b/test/CodeGen/Thumb2/mve-vhaddsub.ll
new file mode 100644
index 00000000000..19979f203f1
--- /dev/null
+++ b/test/CodeGen/Thumb2/mve-vhaddsub.ll
@@ -0,0 +1,281 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve %s -o - | FileCheck %s
+
+define arm_aapcs_vfpcc <16 x i8> @add_ashr_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: add_ashr_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhadd.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add <16 x i8> %src1, %src2
+  %1 = ashr <16 x i8> %0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @add_ashr_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
+; CHECK-LABEL: add_ashr_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhadd.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add <8 x i16> %src1, %src2
+  %1 = ashr <8 x i16> %0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @add_ashr_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LABEL: add_ashr_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhadd.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add nsw <4 x i32> %src1, %src2
+  %1 = ashr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @add_lshr_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: add_lshr_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhadd.u8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add <16 x i8> %src1, %src2
+  %1 = lshr <16 x i8> %0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @add_lshr_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
+; CHECK-LABEL: add_lshr_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhadd.u16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add <8 x i16> %src1, %src2
+  %1 = lshr <8 x i16> %0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @add_lshr_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LABEL: add_lshr_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhadd.u32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add nsw <4 x i32> %src1, %src2
+  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @sub_ashr_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: sub_ashr_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhsub.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub <16 x i8> %src1, %src2
+  %1 = ashr <16 x i8> %0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sub_ashr_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
+; CHECK-LABEL: sub_ashr_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhsub.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub <8 x i16> %src1, %src2
+  %1 = ashr <8 x i16> %0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sub_ashr_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LABEL: sub_ashr_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhsub.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub nsw <4 x i32> %src1, %src2
+  %1 = ashr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @sub_lshr_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: sub_lshr_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhsub.u8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub <16 x i8> %src1, %src2
+  %1 = lshr <16 x i8> %0, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sub_lshr_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
+; CHECK-LABEL: sub_lshr_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhsub.u16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub <8 x i16> %src1, %src2
+  %1 = lshr <8 x i16> %0, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sub_lshr_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LABEL: sub_lshr_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhsub.u32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub nsw <4 x i32> %src1, %src2
+  %1 = lshr <4 x i32> %0, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %1
+}
+
+
+
+define arm_aapcs_vfpcc <16 x i8> @add_sdiv_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: add_sdiv_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.i8 q0, q0, q1
+; CHECK-NEXT:    vshr.u8 q1, q0, #7
+; CHECK-NEXT:    vhadd.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add <16 x i8> %src1, %src2
+  %1 = sdiv <16 x i8> %0, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @add_sdiv_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
+; CHECK-LABEL: add_sdiv_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.i16 q0, q0, q1
+; CHECK-NEXT:    vshr.u16 q1, q0, #15
+; CHECK-NEXT:    vhadd.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add <8 x i16> %src1, %src2
+  %1 = sdiv <8 x i16> %0, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @add_sdiv_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LABEL: add_sdiv_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vshr.u32 q1, q0, #31
+; CHECK-NEXT:    vhadd.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add nsw <4 x i32> %src1, %src2
+  %1 = sdiv <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @add_udiv_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: add_udiv_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhadd.u8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add <16 x i8> %src1, %src2
+  %1 = udiv <16 x i8> %0, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @add_udiv_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
+; CHECK-LABEL: add_udiv_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhadd.u16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add <8 x i16> %src1, %src2
+  %1 = udiv <8 x i16> %0, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @add_udiv_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LABEL: add_udiv_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhadd.u32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = add nsw <4 x i32> %src1, %src2
+  %1 = udiv <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @sub_sdiv_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: sub_sdiv_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vsub.i8 q0, q0, q1
+; CHECK-NEXT:    vshr.u8 q1, q0, #7
+; CHECK-NEXT:    vhadd.s8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub <16 x i8> %src1, %src2
+  %1 = sdiv <16 x i8> %0, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sub_sdiv_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
+; CHECK-LABEL: sub_sdiv_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vsub.i16 q0, q0, q1
+; CHECK-NEXT:    vshr.u16 q1, q0, #15
+; CHECK-NEXT:    vhadd.s16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub <8 x i16> %src1, %src2
+  %1 = sdiv <8 x i16> %0, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sub_sdiv_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LABEL: sub_sdiv_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vsub.i32 q0, q0, q1
+; CHECK-NEXT:    vshr.u32 q1, q0, #31
+; CHECK-NEXT:    vhadd.s32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub nsw <4 x i32> %src1, %src2
+  %1 = sdiv <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @sub_udiv_v16i8(<16 x i8> %src1, <16 x i8> %src2) {
+; CHECK-LABEL: sub_udiv_v16i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhsub.u8 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub <16 x i8> %src1, %src2
+  %1 = udiv <16 x i8> %0, <i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2, i8 2>
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @sub_udiv_v8i16(<8 x i16> %src1, <8 x i16> %src2) {
+; CHECK-LABEL: sub_udiv_v8i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhsub.u16 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub <8 x i16> %src1, %src2
+  %1 = udiv <8 x i16> %0, <i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @sub_udiv_v4i32(<4 x i32> %src1, <4 x i32> %src2) {
+; CHECK-LABEL: sub_udiv_v4i32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vhsub.u32 q0, q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = sub nsw <4 x i32> %src1, %src2
+  %1 = udiv <4 x i32> %0, <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x i32> %1
+}
+
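---

The tests above encode the rounding subtlety behind these patterns: an add/sub whose result is shifted right by one maps directly onto a single VHADD/VHSUB, but sdiv by 2 rounds toward zero while ashr (and the halving instructions) round toward negative infinity, so the *_sdiv_* cases keep an explicit sign-bit fixup (vshr #7/#15/#31 plus vhadd) instead of folding to one instruction. Below is a minimal scalar model of the per-lane semantics; it is an illustrative sketch, and the helper names vhadd_s8/vhsub_s8 are ours, not LLVM source.

// Scalar per-lane model of MVE halving add/subtract (illustrative only).
#include <cassert>
#include <cstdint>

// vhadd.s8 lane: form the sum at double width, then arithmetic-shift right
// by one, so there is no intermediate overflow and rounding is toward
// negative infinity (like ashr), not toward zero (like sdiv).
static int8_t vhadd_s8(int8_t a, int8_t b) {
  return static_cast<int8_t>((static_cast<int16_t>(a) + b) >> 1);
}

// vhsub.s8 lane: the same, for the difference.
static int8_t vhsub_s8(int8_t a, int8_t b) {
  return static_cast<int8_t>((static_cast<int16_t>(a) - b) >> 1);
}

int main() {
  // Halving rounds toward negative infinity, like ashr ...
  assert(vhadd_s8(-3, 0) == -2);
  assert(vhsub_s8(1, 4) == -2);
  // ... while sdiv by 2 rounds toward zero.
  assert(-3 / 2 == -1);

  // The *_sdiv_* tests therefore add the sign bit of the intermediate
  // result before halving (vadd/vsub; vshr.u8 #7; vhadd.s8):
  int8_t sum = -3;                               // vadd.i8 / vsub.i8 result
  int8_t sign = static_cast<uint8_t>(sum) >> 7;  // vshr.u8 q1, q0, #7
  assert(vhadd_s8(sum, sign) == sum / 2);        // vhadd.s8 q0, q0, q1
  return 0;
}

The unsigned *_udiv_* cases need no fixup because udiv by 2 and lshr by 1 agree exactly, which is why they fold straight to vhadd.u*/vhsub.u* above.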