From 8210c19b3b90c3bd87c152542e6be5331e842efe Mon Sep 17 00:00:00 2001
From: Jessica Paquette
Date: Tue, 2 Jun 2020 09:30:04 -0700
Subject: [PATCH] [AArch64][GlobalISel] Select uzp1 and uzp2

Port the mask-recognition code for uzp1 and uzp2 from AArch64ISelLowering.

Add two custom opcodes: G_UZP1 and G_UZP2. Produce them in the
post-legalizer combiner when the shuffle mask checks out.

Tests:
- postlegalizer-combiner-uzp.mir verifies that we create G_UZP1 and
  G_UZP2. The testcases that check that we create them come from
  neon-perm.ll.
- select-uzp.mir verifies that we can select G_UZP1 and G_UZP2.

Differential Revision: https://reviews.llvm.org/D81049
---
 lib/Target/AArch64/AArch64Combine.td          |  17 +-
 lib/Target/AArch64/AArch64InstrGISel.td       |  16 ++
 .../AArch64/AArch64PostLegalizerCombiner.cpp  |  36 ++++-
 .../GlobalISel/postlegalizer-combiner-uzp.mir | 146 ++++++++++++++++++
 .../CodeGen/AArch64/GlobalISel/select-uzp.mir |  53 +++++++
 5 files changed, 265 insertions(+), 3 deletions(-)
 create mode 100644 test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-uzp.mir
 create mode 100644 test/CodeGen/AArch64/GlobalISel/select-uzp.mir

diff --git a/lib/Target/AArch64/AArch64Combine.td b/lib/Target/AArch64/AArch64Combine.td
index 0e96a0785e5..21d4450d43a 100644
--- a/lib/Target/AArch64/AArch64Combine.td
+++ b/lib/Target/AArch64/AArch64Combine.td
@@ -29,11 +29,24 @@ def zip : GICombineRule<
   (defs root:$root, zip_matchdata:$matchinfo),
   (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
          [{ return matchZip(*${root}, MRI, ${matchinfo}); }]),
-  (apply [{ applyZip(*${root}, ${matchinfo}); }])
+  (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
 >;
 
+def uzp_matchdata : GIDefMatchData<"unsigned">;
+def uzp : GICombineRule<
+  (defs root:$root, uzp_matchdata:$matchinfo),
+  (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+         [{ return matchUZP(*${root}, MRI, ${matchinfo}); }]),
+  (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
+>;
+
+// Combines which replace a G_SHUFFLE_VECTOR with a target-specific pseudo
+// instruction.
+def shuffle_vector_pseudos : GICombineGroup<[zip, uzp]>;
+
 def AArch64PostLegalizerCombinerHelper
     : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper",
-                       [erase_undef_store, combines_for_extload, zip]> {
+                       [erase_undef_store, combines_for_extload,
+                        shuffle_vector_pseudos]> {
   let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule";
 }
diff --git a/lib/Target/AArch64/AArch64InstrGISel.td b/lib/Target/AArch64/AArch64InstrGISel.td
index e5b9546f859..650b0eee53c 100644
--- a/lib/Target/AArch64/AArch64InstrGISel.td
+++ b/lib/Target/AArch64/AArch64InstrGISel.td
@@ -25,6 +25,20 @@ def G_ADD_LOW : AArch64GenericInstruction {
   let hasSideEffects = 0;
 }
 
+// Represents an uzp1 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_UZP1 : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$v1, type0:$v2);
+}
+
+// Represents an uzp2 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_UZP2 : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$v1, type0:$v2);
+}
+
 // Represents a zip1 instruction. Produced post-legalization from
 // G_SHUFFLE_VECTORs with appropriate masks.
 def G_ZIP1 : AArch64GenericInstruction {
@@ -39,5 +53,7 @@ def G_ZIP2 : AArch64GenericInstruction {
   let InOperandList = (ins type0:$v1, type0:$v2);
 }
 
+def : GINodeEquiv<G_UZP1, AArch64uzp1>;
+def : GINodeEquiv<G_UZP2, AArch64uzp2>;
 def : GINodeEquiv<G_ZIP1, AArch64zip1>;
 def : GINodeEquiv<G_ZIP2, AArch64zip2>;
diff --git a/lib/Target/AArch64/AArch64PostLegalizerCombiner.cpp b/lib/Target/AArch64/AArch64PostLegalizerCombiner.cpp
index b53830cfe46..baafe080764 100644
--- a/lib/Target/AArch64/AArch64PostLegalizerCombiner.cpp
+++ b/lib/Target/AArch64/AArch64PostLegalizerCombiner.cpp
@@ -28,6 +28,21 @@
 using namespace llvm;
 
+/// \return true if \p M is a uzp mask for a shuffle vector of \p NumElts.
+/// Whether or not G_UZP1 or G_UZP2 should be used is stored in \p WhichResult.
+static bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
+                      unsigned &WhichResult) {
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    // Skip undef indices.
+    if (M[i] < 0)
+      continue;
+    if (static_cast<unsigned>(M[i]) != 2 * i + WhichResult)
+      return false;
+  }
+  return true;
+}
+
 /// \return true if \p M is a zip mask for a shuffle vector of \p NumElts.
 /// Whether or not G_ZIP1 or G_ZIP2 should be used is stored in \p WhichResult.
 static bool isZipMask(ArrayRef<int> M, unsigned NumElts,
@@ -47,6 +62,23 @@ static bool isZipMask(ArrayRef<int> M, unsigned NumElts,
   return true;
 }
 
+/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with
+/// a G_UZP1 or G_UZP2 instruction.
+///
+/// \param [in] MI - The shuffle vector instruction.
+/// \param [out] Opc - Either G_UZP1 or G_UZP2 on success.
+static bool matchUZP(MachineInstr &MI, MachineRegisterInfo &MRI,
+                     unsigned &Opc) {
+  assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+  unsigned WhichResult;
+  ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+  unsigned NumElts = MRI.getType(MI.getOperand(0).getReg()).getNumElements();
+  if (!isUZPMask(ShuffleMask, NumElts, WhichResult))
+    return false;
+  Opc = (WhichResult == 0) ? AArch64::G_UZP1 : AArch64::G_UZP2;
+  return true;
+}
+
 static bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI,
                      unsigned &Opc) {
   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
@@ -59,7 +91,9 @@ static bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI,
   return true;
 }
 
-static bool applyZip(MachineInstr &MI, unsigned Opc) {
+/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo.
+/// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR.
+static bool applyShuffleVectorPseudo(MachineInstr &MI, unsigned Opc) {
   MachineIRBuilder MIRBuilder(MI);
   MIRBuilder.buildInstr(Opc, {MI.getOperand(0).getReg()},
                         {MI.getOperand(1).getReg(), MI.getOperand(2).getReg()});
diff --git a/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-uzp.mir b/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-uzp.mir
new file mode 100644
index 00000000000..2717c6e21d4
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/postlegalizer-combiner-uzp.mir
@@ -0,0 +1,146 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+#
+# Check that we can recognize a shuffle mask for a uzp instruction and produce
+# a G_UZP1 or G_UZP2 where appropriate.
+#
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+...
+---
+name: uzp1_v4s32
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: uzp1_v4s32
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK: [[UZP1_:%[0-9]+]]:_(<4 x s32>) = G_UZP1 [[COPY]], [[COPY1]]
+    ; CHECK: $q0 = COPY [[UZP1_]](<4 x s32>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = COPY $q1
+    %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, shufflemask(0, 2, 4, 6)
+    $q0 = COPY %2(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: uzp2_v4s32
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: uzp2_v4s32
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK: [[UZP2_:%[0-9]+]]:_(<4 x s32>) = G_UZP2 [[COPY]], [[COPY1]]
+    ; CHECK: $q0 = COPY [[UZP2_]](<4 x s32>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = COPY $q1
+    %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, shufflemask(1, 3, 5, 7)
+    $q0 = COPY %2(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: no_uzp1
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $q0, $q1
+
+    ; See isUZPMask: Mask[1] != 2 * i + 0
+
+    ; CHECK-LABEL: name: no_uzp1
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(0, 1, 4, 6)
+    ; CHECK: $q0 = COPY [[SHUF]](<4 x s32>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = COPY $q1
+    %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, shufflemask(0, 1, 4, 6)
+    $q0 = COPY %2(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: no_uzp2
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $q0, $q1
+
+    ; See isUZPMask: Mask[1] != 2 * i + 1
+
+    ; CHECK-LABEL: name: no_uzp2
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(1, 4, 5, 7)
+    ; CHECK: $q0 = COPY [[SHUF]](<4 x s32>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = COPY $q1
+    %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, shufflemask(1, 4, 5, 7)
+    $q0 = COPY %2(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: uzp1_undef
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $q0, $q1
+
+    ; Make sure that we can still produce a uzp1/uzp2 with undef indices.
+
+    ; CHECK-LABEL: name: uzp1_undef
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK: [[UZP1_:%[0-9]+]]:_(<4 x s32>) = G_UZP1 [[COPY]], [[COPY1]]
+    ; CHECK: $q0 = COPY [[UZP1_]](<4 x s32>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = COPY $q1
+    %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, shufflemask(0, -1, 4, 6)
+    $q0 = COPY %2(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: uzp2_undef
+legalized: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $q0, $q1
+
+    ; Make sure that we can still produce a uzp1/uzp2 with undef indices.
+
+    ; CHECK-LABEL: name: uzp2_undef
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1
+    ; CHECK: [[UZP2_:%[0-9]+]]:_(<4 x s32>) = G_UZP2 [[COPY]], [[COPY1]]
+    ; CHECK: $q0 = COPY [[UZP2_]](<4 x s32>)
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:_(<4 x s32>) = COPY $q0
+    %1:_(<4 x s32>) = COPY $q1
+    %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, shufflemask(1, 3, -1, 7)
+    $q0 = COPY %2(<4 x s32>)
+    RET_ReallyLR implicit $q0
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-uzp.mir b/test/CodeGen/AArch64/GlobalISel/select-uzp.mir
new file mode 100644
index 00000000000..1d5affea6c1
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-uzp.mir
@@ -0,0 +1,53 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+#
+# Check that we can select G_UZP1 and G_UZP2 via the tablegen importer.
+#
+# RUN: llc -mtriple aarch64 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+
+...
+---
+name: uzp1_v4s32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: uzp1_v4s32
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
+    ; CHECK: [[UZP1v4i32_:%[0-9]+]]:fpr128 = UZP1v4i32 [[COPY]], [[COPY1]]
+    ; CHECK: $q0 = COPY [[UZP1v4i32_]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:fpr(<4 x s32>) = COPY $q0
+    %1:fpr(<4 x s32>) = COPY $q1
+    %2:fpr(<4 x s32>) = G_UZP1 %0, %1
+    $q0 = COPY %2(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
+---
+name: uzp2_v4s32
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+body: |
+  bb.1.entry:
+    liveins: $q0, $q1
+
+    ; CHECK-LABEL: name: uzp2_v4s32
+    ; CHECK: liveins: $q0, $q1
+    ; CHECK: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+    ; CHECK: [[COPY1:%[0-9]+]]:fpr128 = COPY $q1
+    ; CHECK: [[UZP2v4i32_:%[0-9]+]]:fpr128 = UZP2v4i32 [[COPY]], [[COPY1]]
+    ; CHECK: $q0 = COPY [[UZP2v4i32_]]
+    ; CHECK: RET_ReallyLR implicit $q0
+    %0:fpr(<4 x s32>) = COPY $q0
+    %1:fpr(<4 x s32>) = COPY $q1
+    %2:fpr(<4 x s32>) = G_UZP2 %0, %1
+    $q0 = COPY %2(<4 x s32>)
+    RET_ReallyLR implicit $q0
+
+...
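--
For reviewers who want to poke at the mask classification outside of LLVM,
here is a minimal standalone sketch (not part of the patch) that mirrors the
isUZPMask logic above. It swaps llvm::ArrayRef<int> for std::vector<int> so
it compiles on its own; everything else is hypothetical scaffolding for
illustration. uzp1 takes the even-indexed elements of the two concatenated
inputs, uzp2 the odd-indexed ones, and undef (negative) indices match either.

#include <cstdio>
#include <vector>

// Returns true if M is a uzp mask for a vector of NumElts elements.
// WhichResult is 0 for uzp1 (even elements) and 1 for uzp2 (odd elements).
// Mirrors isUZPMask from the patch, with std::vector standing in for
// llvm::ArrayRef<int> (an assumption made so this compiles without LLVM).
static bool isUZPMask(const std::vector<int> &M, unsigned NumElts,
                      unsigned &WhichResult) {
  WhichResult = (M[0] == 0 ? 0 : 1);
  for (unsigned i = 0; i != NumElts; ++i) {
    // Skip undef indices.
    if (M[i] < 0)
      continue;
    if (static_cast<unsigned>(M[i]) != 2 * i + WhichResult)
      return false;
  }
  return true;
}

int main() {
  // Masks taken from the testcases above.
  std::vector<std::vector<int>> Masks = {
      {0, 2, 4, 6},  // uzp1: even-indexed elements.
      {1, 3, 5, 7},  // uzp2: odd-indexed elements.
      {0, -1, 4, 6}, // Still uzp1: the undef index is skipped.
      {0, 1, 4, 6},  // Not a uzp mask: M[1] != 2 * 1 + 0.
  };
  for (const auto &M : Masks) {
    unsigned WhichResult;
    if (isUZPMask(M, static_cast<unsigned>(M.size()), WhichResult))
      printf("uzp%u\n", WhichResult + 1);
    else
      printf("not a uzp mask\n");
  }
  return 0;
}

Running it prints uzp1, uzp2, uzp1, not a uzp mask, matching what the
combiner produces for the four shuffle masks in postlegalizer-combiner-uzp.mir.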