From fe8cc25a97287603682b3a728fd9b0602ab68334 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic
Date: Fri, 24 Jul 2020 15:38:27 -0400
Subject: [PATCH] [PowerPC] Fix computation of offset for load-and-splat for
 permuted loads

Unfortunately, this is another regression from my canonicalization patch
(1fed131660b2). The patch contained two implicit assumptions:

1. That we would have a permuted load only if we are loading a partial vector
2. That a partial vector load would necessarily be as wide as the splat

However, assumption 2 is not correct since it is possible to do a wider
load and splat only half of it.
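
For example, a pattern of the following shape (a sketch with invented value
names; the new testSplat4Low test below exercises exactly this case) performs
an 8-byte load but splats only the upper 4 bytes of the loaded value:

  %v = load <8 x i8>, <8 x i8>* %ptr, align 8
  %splat = shufflevector <8 x i8> %v, <8 x i8> undef,
      <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7,
                  i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>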

This patch corrects that assumption by simply checking whether the load is
permuted and adjusting the offset if it is.
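
Concretely, on little endian subtargets the permuted load leaves the loaded
value in the left (big-endian-numbered) half of a vector register that is
8 bytes wider than the value itself. The splat index computed from the
shuffle mask therefore has to be bumped by two elements for word splats and
by one element for doubleword splats before the load offset is computed.
In the testSplat4Low case below, for instance, the index goes from 0 to 2
and the offset works out to (3 - 2) * 4 = 4, matching the addi/lxvwsx
sequence in the new CHECK-P9 lines.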
---
 lib/Target/PowerPC/PPCISelLowering.cpp         | 26 ++++--
 .../PowerPC/canonical-merge-shuffles.ll        | 88 +++++++++++++++++++
 2 files changed, 106 insertions(+), 8 deletions(-)

diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index edc23b2673f..c2ba7195509 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9126,13 +9126,15 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
                      Op0.getOperand(1));
 }
 
-static const SDValue *getNormalLoadInput(const SDValue &Op) {
+static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {
   const SDValue *InputLoad = &Op;
   if (InputLoad->getOpcode() == ISD::BITCAST)
     InputLoad = &InputLoad->getOperand(0);
   if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
-      InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED)
+      InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {
+    IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;
     InputLoad = &InputLoad->getOperand(0);
+  }
   if (InputLoad->getOpcode() != ISD::LOAD)
     return nullptr;
   LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
@@ -9304,7 +9306,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
 
   if (!BVNIsConstantSplat || SplatBitSize > 32) {
 
-    const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
+    bool IsPermutedLoad = false;
+    const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);
     // Handle load-and-splat patterns as we have instructions that will do this
     // in one go.
     if (InputLoad && DAG.isSplatValue(Op, true)) {
@@ -9927,7 +9930,8 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   // If this is a load-and-splat, we can do that with a single instruction
   // in some cases. However if the load has multiple uses, we don't want to
   // combine it because that will just produce multiple loads.
-  const SDValue *InputLoad = getNormalLoadInput(V1);
+  bool IsPermutedLoad = false;
+  const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad);
   if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
       (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
       InputLoad->hasOneUse()) {
@@ -9935,6 +9939,16 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
     int SplatIdx =
       PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
+    // The splat index for permuted loads will be in the left half of the vector
+    // which is strictly wider than the loaded value by 8 bytes. So we need to
+    // adjust the splat index to point to the correct address in memory.
+    if (IsPermutedLoad) {
+      assert(isLittleEndian && "Unexpected permuted load on big endian target");
+      SplatIdx += IsFourByte ? 2 : 1;
+      assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
+             "Splat of a value outside of the loaded memory");
+    }
+
     LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
     // For 4-byte load-and-splat, we need Power9.
     if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
@@ -9944,10 +9958,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
         else
           Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
 
-        // If we are loading a partial vector, it does not make sense to adjust
-        // the base pointer. This happens with (splat (s_to_v_permuted (ld))).
-        if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64))
-          Offset = 0;
         SDValue BasePtr = LD->getBasePtr();
         if (Offset != 0)
           BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
diff --git a/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
index 11bc2bae987..cdd04b33318 100644
--- a/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
+++ b/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
@@ -446,5 +446,93 @@ entry:
   ret <16 x i8> %shuffle
 }
 
+define dso_local <4 x i32> @testSplat4Low(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: testSplat4Low:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    ld r3, 0(r3)
+; CHECK-P8-NEXT:    mtfprd f0, r3
+; CHECK-P8-NEXT:    xxspltw v2, vs0, 0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: testSplat4Low:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    addi r3, r3, 4
+; CHECK-P9-NEXT:    lxvwsx v2, 0, r3
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-NOVSX-LABEL: testSplat4Low:
+; CHECK-NOVSX:       # %bb.0: # %entry
+; CHECK-NOVSX-NEXT:    ld r3, 0(r3)
+; CHECK-NOVSX-NEXT:    addi r4, r1, -16
+; CHECK-NOVSX-NEXT:    std r3, -16(r1)
+; CHECK-NOVSX-NEXT:    lvx v2, 0, r4
+; CHECK-NOVSX-NEXT:    vspltw v2, v2, 2
+; CHECK-NOVSX-NEXT:    blr
+entry:
+  %0 = load <8 x i8>, <8 x i8>* %ptr, align 8
+  %vecinit18 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %1 = bitcast <16 x i8> %vecinit18 to <4 x i32>
+  ret <4 x i32> %1
+}
+
+; Function Attrs: norecurse nounwind readonly
+define dso_local <4 x i32> @testSplat4hi(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: testSplat4hi:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    ld r3, 0(r3)
+; CHECK-P8-NEXT:    mtfprd f0, r3
+; CHECK-P8-NEXT:    xxspltw v2, vs0, 1
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: testSplat4hi:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxvwsx v2, 0, r3
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-NOVSX-LABEL: testSplat4hi:
+; CHECK-NOVSX:       # %bb.0: # %entry
+; CHECK-NOVSX-NEXT:    ld r3, 0(r3)
+; CHECK-NOVSX-NEXT:    addi r4, r1, -16
+; CHECK-NOVSX-NEXT:    std r3, -16(r1)
+; CHECK-NOVSX-NEXT:    lvx v2, 0, r4
+; CHECK-NOVSX-NEXT:    vspltw v2, v2, 3
+; CHECK-NOVSX-NEXT:    blr
+entry:
+  %0 = load <8 x i8>, <8 x i8>* %ptr, align 8
+  %vecinit22 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %1 = bitcast <16 x i8> %vecinit22 to <4 x i32>
+  ret <4 x i32> %1
+}
+
+; Function Attrs: norecurse nounwind readonly
+define dso_local <2 x i64> @testSplat8(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: testSplat8:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lxvdsx v2, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: testSplat8:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxvdsx v2, 0, r3
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-NOVSX-LABEL: testSplat8:
+; CHECK-NOVSX:       # %bb.0: # %entry
+; CHECK-NOVSX-NEXT:    ld r3, 0(r3)
+; CHECK-NOVSX-NEXT:    addis r4, r2, .LCPI19_0@toc@ha
+; CHECK-NOVSX-NEXT:    addi r4, r4, .LCPI19_0@toc@l
+; CHECK-NOVSX-NEXT:    lvx v2, 0, r4
+; CHECK-NOVSX-NEXT:    std r3, -16(r1)
+; CHECK-NOVSX-NEXT:    addi r3, r1, -16
+; CHECK-NOVSX-NEXT:    lvx v3, 0, r3
+; CHECK-NOVSX-NEXT:    vperm v2, v3, v3, v2
+; CHECK-NOVSX-NEXT:    blr
+entry:
+  %0 = load <8 x i8>, <8 x i8>* %ptr, align 8
+  %vecinit30 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %1 = bitcast <16 x i8> %vecinit30 to <2 x i64>
+  ret <2 x i64> %1
+}
+
 declare double @dummy() local_unnamed_addr
 attributes #0 = { nounwind }