
[PowerPC] Reduce symmetrical swaps for lane-insensitive vector ops

This patch simplifies the pattern (xxswap (vec-op (xxswap a) (xxswap b)))
into (vec-op a b) when vec-op is lane-insensitive. The motivating case is
the ScalarToVector-VecOp-ExtractElement sequence on little-endian targets,
but the peephole itself does not depend on endianness, so big-endian
targets may benefit from it as well.
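
The identity being exploited can be checked on its own: for a purely
element-wise (lane-insensitive) operation, swapping both inputs and then
swapping the result gives the same value as applying the operation
directly. Below is a small standalone C++ sketch (illustrative only, not
part of the patch; swapd and addv are made-up helpers modeling xxswapd and
an element-wise vector add) demonstrating this on a 2 x i64 vector modeled
as a plain array:

#include <array>
#include <cassert>
#include <cstdint>

using V2 = std::array<uint64_t, 2>;

// Model of a doubleword swap (xxswapd, i.e. XXPERMDI with immediate 2).
static V2 swapd(V2 V) { return {V[1], V[0]}; }

// Any element-wise op is lane-insensitive; unsigned add serves as an example.
static V2 addv(V2 A, V2 B) { return {A[0] + B[0], A[1] + B[1]}; }

int main() {
  V2 A = {1, 2}, B = {30, 40};
  // Swap both inputs, operate, then swap the result back...
  V2 Swapped = swapd(addv(swapd(A), swapd(B)));
  // ...and the answer matches operating on the unswapped inputs.
  assert(Swapped == addv(A, B));
  return 0;
}

The peephole below applies this identity to MachineSDNodes after
instruction selection: when the outer swap, the lane-insensitive op, and
both inner swaps are all present, the swaps are bypassed and left for
later dead-code elimination.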

Reviewed By: nemanjai

Differential Revision: https://reviews.llvm.org/D97658
Author: Qiu Chaofan
Date: 2021-03-10 15:21:32 +08:00
Parent: f70507366d
Commit: b78bd4345c
2 changed files with 157 additions and 0 deletions

@@ -6717,6 +6717,102 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() {
  CurDAG->RemoveDeadNodes();
}

static bool isVSXSwap(SDValue N) {
  if (!N->isMachineOpcode())
    return false;
  unsigned Opc = N->getMachineOpcode();
  // Single-operand XXPERMDI or the regular XXPERMDI/XXSLDWI where the
  // immediate operand is 2.
  if (Opc == PPC::XXPERMDIs) {
    return isa<ConstantSDNode>(N->getOperand(1)) &&
           N->getConstantOperandVal(1) == 2;
  } else if (Opc == PPC::XXPERMDI || Opc == PPC::XXSLDWI) {
    return N->getOperand(0) == N->getOperand(1) &&
           isa<ConstantSDNode>(N->getOperand(2)) &&
           N->getConstantOperandVal(2) == 2;
  }
  return false;
}

// TODO: Make this complete and replace with a table-gen bit.
static bool isLaneInsensitive(SDValue N) {
  if (!N->isMachineOpcode())
    return false;
  unsigned Opc = N->getMachineOpcode();
  switch (Opc) {
  default:
    return false;
  case PPC::VAVGSB:
  case PPC::VAVGUB:
  case PPC::VAVGSH:
  case PPC::VAVGUH:
  case PPC::VAVGSW:
  case PPC::VAVGUW:
  case PPC::VMAXFP:
  case PPC::VMAXSB:
  case PPC::VMAXUB:
  case PPC::VMAXSH:
  case PPC::VMAXUH:
  case PPC::VMAXSW:
  case PPC::VMAXUW:
  case PPC::VMINFP:
  case PPC::VMINSB:
  case PPC::VMINUB:
  case PPC::VMINSH:
  case PPC::VMINUH:
  case PPC::VMINSW:
  case PPC::VMINUW:
  case PPC::VADDFP:
  case PPC::VADDUBM:
  case PPC::VADDUHM:
  case PPC::VADDUWM:
  case PPC::VSUBFP:
  case PPC::VSUBUBM:
  case PPC::VSUBUHM:
  case PPC::VSUBUWM:
  case PPC::VAND:
  case PPC::VANDC:
  case PPC::VOR:
  case PPC::VORC:
  case PPC::VXOR:
  case PPC::VNOR:
  case PPC::VMULUWM:
    return true;
  }
}

// Try to simplify (xxswap (vec-op (xxswap) (xxswap))) where vec-op is
// lane-insensitive.
static void reduceVSXSwap(SDNode *N, SelectionDAG *DAG) {
  // The desired xxswap might be the source of a COPY_TO_REGCLASS.
  // TODO: Can we move this into a common helper for the DAG?
  auto SkipRCCopy = [](SDValue V) {
    while (V->isMachineOpcode() &&
           V->getMachineOpcode() == TargetOpcode::COPY_TO_REGCLASS)
      V = V->getOperand(0);
    return V;
  };
  SDValue VecOp = SkipRCCopy(N->getOperand(0));
  if (!isLaneInsensitive(VecOp))
    return;
  SDValue LHS = SkipRCCopy(VecOp.getOperand(0)),
          RHS = SkipRCCopy(VecOp.getOperand(1));
  if (!LHS.hasOneUse() || !RHS.hasOneUse() || !isVSXSwap(LHS) ||
      !isVSXSwap(RHS))
    return;
  // These swaps may still have chain uses here; rely on dead-code elimination
  // in later passes to remove them.
  DAG->ReplaceAllUsesOfValueWith(LHS, LHS.getOperand(0));
  DAG->ReplaceAllUsesOfValueWith(RHS, RHS.getOperand(0));
  DAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), N->getOperand(0));
}

void PPCDAGToDAGISel::PeepholePPC64() {
  SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();

@@ -6726,6 +6822,9 @@ void PPCDAGToDAGISel::PeepholePPC64() {
    if (N->use_empty() || !N->isMachineOpcode())
      continue;

    if (isVSXSwap(SDValue(N, 0)))
      reduceVSXSwap(N, CurDAG);

    unsigned FirstOp;
    unsigned StorageOpcode = N->getMachineOpcode();
    bool RequiresMod4Offset = false;

@@ -0,0 +1,58 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le < %s | FileCheck %s
define i64 @test1(i64* %a, i64* %b) {
; CHECK-LABEL: test1:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mr 5, 3
; CHECK-NEXT: ld 3, 0(3)
; CHECK-NEXT: ld 4, 0(4)
; CHECK-NEXT: mtvsrd 34, 3
; CHECK-NEXT: add 3, 3, 4
; CHECK-NEXT: mtvsrd 35, 4
; CHECK-NEXT: vavgsb 2, 2, 3
; CHECK-NEXT: stxsdx 34, 0, 5
; CHECK-NEXT: blr
entry:
  %lhs = load i64, i64* %a, align 8
  %rhs = load i64, i64* %b, align 8
  %sum = add i64 %lhs, %rhs
  %lv = insertelement <2 x i64> undef, i64 %lhs, i32 0
  %rv = insertelement <2 x i64> undef, i64 %rhs, i32 0
  %lhc = bitcast <2 x i64> %lv to <16 x i8>
  %rhc = bitcast <2 x i64> %rv to <16 x i8>
  %add = call <16 x i8> @llvm.ppc.altivec.vavgsb(<16 x i8> %lhc, <16 x i8> %rhc)
  %cb = bitcast <16 x i8> %add to <2 x i64>
  %fv = extractelement <2 x i64> %cb, i32 0
  store i64 %fv, i64* %a, align 8
  ret i64 %sum
}

define i64 @test2(i64* %a, i64* %b) {
; CHECK-LABEL: test2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: mr 5, 3
; CHECK-NEXT: ld 3, 0(3)
; CHECK-NEXT: ld 4, 0(4)
; CHECK-NEXT: mtvsrd 34, 3
; CHECK-NEXT: add 3, 3, 4
; CHECK-NEXT: mtvsrd 35, 4
; CHECK-NEXT: vadduhm 2, 2, 3
; CHECK-NEXT: stxsdx 34, 0, 5
; CHECK-NEXT: blr
entry:
  %lhs = load i64, i64* %a, align 8
  %rhs = load i64, i64* %b, align 8
  %sum = add i64 %lhs, %rhs
  %lv = insertelement <2 x i64> undef, i64 %lhs, i32 0
  %rv = insertelement <2 x i64> undef, i64 %rhs, i32 0
  %lhc = bitcast <2 x i64> %lv to <8 x i16>
  %rhc = bitcast <2 x i64> %rv to <8 x i16>
  %add = add <8 x i16> %lhc, %rhc
  %cb = bitcast <8 x i16> %add to <2 x i64>
  %fv = extractelement <2 x i64> %cb, i32 0
  store i64 %fv, i64* %a, align 8
  ret i64 %sum
}

declare <16 x i8> @llvm.ppc.altivec.vavgsb(<16 x i8>, <16 x i8>)