[PowerPC] Try to simplify a Swap if it feeds a Splat

When a Swap feeds a Splat, we can sometimes change the index on the Splat and then remove the Swap instruction. This recommits the change, with the failing test case fixed, after the original commit was pulled. Original revision is here: https://reviews.llvm.org/D39009 llvm-svn: 316478
2025-01-31 12:41:49 +01:00 · 2017-10-24 17:44:27 +00:00 · 2017-10-24 17:44:27 +00:00 · 1e1dcf2d50
commit 1e1dcf2d50
parent 463fb86f87
3 changed files with 183 additions and 2 deletions
--- a/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/lib/Target/PowerPC/PPCMIPeephole.cpp
@ -375,6 +375,53 @@ bool PPCMIPeephole::simplifyCode(void) {
            MI.getOperand(2).setImm(NewElem);
          }
        }
+
+        // Splat is fed by a SWAP which is a permute of this form
+        //  XXPERMDI %VA, %VA, 2
+        // Since the splat instruction can use any of the vector elements to do
+        //  the splat we do not have to rearrange the elements in the vector
+        //  with a swap before we do the splat. We can simply do the splat from
+        //  a different index.
+        // If the swap has only one use (the splat) then we can completely
+        //  remove the swap too.
+        if (DefOpcode == PPC::XXPERMDI && MI.getOperand(1).isImm()) {
+          unsigned SwapRes = DefMI->getOperand(0).getReg();
+          unsigned SwapOp1 = DefMI->getOperand(1).getReg();
+          unsigned SwapOp2 = DefMI->getOperand(2).getReg();
+          unsigned SwapImm = DefMI->getOperand(3).getImm();
+          unsigned SplatImm = MI.getOperand(1).getImm();
+
+          // Break if this permute is not a swap.
+          if (SwapOp1 != SwapOp2 || SwapImm != 2)
+            break;
+
+          unsigned NewElem = 0;
+          // Compute the new index to use for the splat.
+          if (MI.getOpcode() == PPC::VSPLTB)
+            NewElem = (SplatImm + 8) & 0xF;
+          else if (MI.getOpcode() == PPC::VSPLTH)
+            NewElem = (SplatImm + 4) & 0x7;
+          else if (MI.getOpcode() == PPC::XXSPLTW)
+            NewElem = (SplatImm + 2) & 0x3;
+          else {
+            DEBUG(dbgs() << "Unknown splat opcode.");
+            DEBUG(MI.dump());
+            break;
+          }
+
+          if (MRI->hasOneNonDBGUse(SwapRes)) {
+            DEBUG(dbgs() << "Removing redundant swap: ");
+            DEBUG(DefMI->dump());
+            ToErase = DefMI;
+          }
+          Simplified = true;
+          DEBUG(dbgs() << "Changing splat immediate from " << SplatImm <<
+                " to " << NewElem << " in instruction: ");
+          DEBUG(MI.dump());
+          MI.getOperand(1).setImm(NewElem);
+          MI.getOperand(2).setReg(SwapOp1);
+        }
+
        break;
      }
      case PPC::XVCVDPSP: {
--- a/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
+++ b/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
@ -16,7 +16,7 @@ entry:
 ; CHECK: sldi [[REG1:[0-9]+]], 3, 56
 ; CHECK: mtvsrd {{[0-9]+}}, [[REG1]]
 ; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3
-; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
+; CHECK-LE: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
 }

 ; Function Attrs: norecurse nounwind readnone
@ -28,7 +28,7 @@ entry:
 ; CHECK: sldi [[REG1:[0-9]+]], 3, 48
 ; CHECK: mtvsrd {{[0-9]+}}, [[REG1]]
 ; CHECK-LE: mtvsrd [[REG1:[0-9]+]], 3
-; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
+; CHECK-LE: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
 }

 ; Function Attrs: norecurse nounwind readnone
--- a/test/CodeGen/PowerPC/ppc64-peephole-swap.ll
+++ b/test/CodeGen/PowerPC/ppc64-peephole-swap.ll
@ -0,0 +1,134 @@
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s -check-prefix=CHECK-PWR8
+
+; The straightforward expansion of this code will result in a swap followed by a
+;  splat. However, the swap is not needed since in this case the splat is the
+;  only use.
+; We want to check that we are not using the swap and that we have indexed the
+;  splat to the correct location.
+; 8 Bit Signed Version of the test.
+; Function Attrs: norecurse nounwind readnone
+define <16 x i8> @splat_8_plus(<16 x i8> %v, i8 signext %c) local_unnamed_addr {
+entry:
+  %splat.splatinsert.i = insertelement <16 x i8> undef, i8 %c, i32 0
+  %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> undef, <16 x i32> zeroinitializer
+  %add = add <16 x i8> %splat.splat.i, %v
+  ret <16 x i8> %add
+; CHECK-LABEL: splat_8_plus
+; CHECK-NOT: xxswapd
+; CHECK: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_8_plus
+; CHECK-PWR8-NOT: xxswapd
+; CHECK-PWR8: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
+; CHECK-PWR8: blr
+}
+
+; 8 Bit Unsigned Version of the test.
+; Function Attrs: norecurse nounwind readnone
+define <16 x i8> @splat_u8_plus(<16 x i8> %v, i8 zeroext %c) local_unnamed_addr {
+entry:
+  %splat.splatinsert.i = insertelement <16 x i8> undef, i8 %c, i32 0
+  %splat.splat.i = shufflevector <16 x i8> %splat.splatinsert.i, <16 x i8> undef, <16 x i32> zeroinitializer
+  %add = add <16 x i8> %splat.splat.i, %v
+  ret <16 x i8> %add
+; CHECK-LABEL: splat_u8_plus
+; CHECK-NOT: xxswapd
+; CHECK: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_u8_plus
+; CHECK-PWR8-NOT: xxswapd
+; CHECK-PWR8: vspltb {{[0-9]+}}, {{[0-9]+}}, 7
+; CHECK-PWR8: blr
+}
+
+; 16 Bit Signed Version of the test.
+; Function Attrs: norecurse nounwind readnone
+define <8 x i16> @splat_16_plus(<8 x i16> %v, i16 signext %c) local_unnamed_addr {
+entry:
+  %0 = shl i16 %c, 8
+  %conv.i = ashr exact i16 %0, 8
+  %splat.splatinsert.i = insertelement <8 x i16> undef, i16 %conv.i, i32 0
+  %splat.splat.i = shufflevector <8 x i16> %splat.splatinsert.i, <8 x i16> undef, <8 x i32> zeroinitializer
+  %add = add <8 x i16> %splat.splat.i, %v
+  ret <8 x i16> %add
+; CHECK-LABEL: splat_16_plus
+; CHECK-NOT: xxswapd
+; CHECK: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_16_plus
+; CHECK-PWR8-NOT: xxswapd
+; CHECK-PWR8: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
+; CHECK-PWR8: blr
+}
+
+; 16 Bit Unsigned Version of the test.
+; Function Attrs: norecurse nounwind readnone
+define <8 x i16> @splat_u16_plus(<8 x i16> %v, i16 zeroext %c) local_unnamed_addr {
+entry:
+  %0 = shl i16 %c, 8
+  %conv.i = ashr exact i16 %0, 8
+  %splat.splatinsert.i = insertelement <8 x i16> undef, i16 %conv.i, i32 0
+  %splat.splat.i = shufflevector <8 x i16> %splat.splatinsert.i, <8 x i16> undef, <8 x i32> zeroinitializer
+  %add = add <8 x i16> %splat.splat.i, %v
+  ret <8 x i16> %add
+; CHECK-LABEL: splat_u16_plus
+; CHECK-NOT: xxswapd
+; CHECK: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_u16_plus
+; CHECK-PWR8-NOT: xxswapd
+; CHECK-PWR8: vsplth {{[0-9]+}}, {{[0-9]+}}, 3
+; CHECK-PWR8: blr
+}
+
+; 32 Bit Signed Version of the test.
+; The 32 bit examples work differently than the 8 and 16 bit versions of the
+;  test. On Power 9 we have the mtvsrws instruction that does both the move to
+;  register and the splat so it does not really test the newly implemented code.
+; On Power 9 for the 32 bit case we don't need the new simplification. It is
+;  just here for completeness.
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @splat_32_plus(<4 x i32> %v, i32 signext %c) local_unnamed_addr {
+entry:
+  %sext = shl i32 %c, 24
+  %conv.i = ashr exact i32 %sext, 24
+  %splat.splatinsert.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0
+  %splat.splat.i = shufflevector <4 x i32> %splat.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add = add <4 x i32> %splat.splat.i, %v
+  ret <4 x i32> %add
+; CHECK-LABEL: splat_32_plus
+; CHECK-NOT: xxswapd
+; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}}
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_32_plus
+; CHECK-PWR8-NOT: xxswapd
+; CHECK-PWR8: xxspltw {{[0-9]+}}, {{[0-9]+}}, 1
+; CHECK-PWR8: blr
+}
+
+; 32 Bit Unsigned Version of the test.
+; The 32 bit examples work differently than the 8 and 16 bit versions of the
+;  test. On Power 9 we have the mtvsrws instruction that does both the move to
+;  register and the splat so it does not really test the newly implemented code.
+; On Power 9 for the 32 bit case we don't need the new simplification. It is
+;  just here for completeness.
+; Function Attrs: norecurse nounwind readnone
+define <4 x i32> @splat_u32_plus(<4 x i32> %v, i32 zeroext %c) local_unnamed_addr {
+entry:
+  %sext = shl i32 %c, 24
+  %conv.i = ashr exact i32 %sext, 24
+  %splat.splatinsert.i = insertelement <4 x i32> undef, i32 %conv.i, i32 0
+  %splat.splat.i = shufflevector <4 x i32> %splat.splatinsert.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %add = add <4 x i32> %splat.splat.i, %v
+  ret <4 x i32> %add
+; CHECK-LABEL: splat_u32_plus
+; CHECK-NOT: xxswapd
+; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}}
+; CHECK: blr
+; CHECK-PWR8-LABEL: splat_u32_plus
+; CHECK-PWR8-NOT: xxswapd
+; CHECK-PWR8: xxspltw {{[0-9]+}}, {{[0-9]+}}, 1
+; CHECK-PWR8: blr
+}
+