From fe8cc25a97287603682b3a728fd9b0602ab68334 Mon Sep 17 00:00:00 2001
From: Nemanja Ivanovic
Date: Fri, 24 Jul 2020 15:38:27 -0400
Subject: [PATCH] [PowerPC] Fix computation of offset for load-and-splat for
 permuted loads

Unfortunately, this is another regression from my canonicalization patch
(1fed131660b2). The patch contained two implicit assumptions:

1. That we would have a permuted load only if we are loading a partial vector
2. That a partial vector load would necessarily be as wide as the splat

However, assumption 2 is not correct since it is possible to do a wider
load and splat only half of it.
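
For example, a pattern of the following shape (a sketch with invented value
names; the new testSplat4Low test below exercises exactly this case) performs
an 8-byte load but splats only the upper 4 bytes of the loaded value:

  %v = load <8 x i8>, <8 x i8>* %ptr, align 8
  %splat = shufflevector <8 x i8> %v, <8 x i8> undef,
      <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7,
                  i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>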

This patch corrects that assumption by simply checking whether the load is
permuted and adjusting the offset if it is.
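
Concretely, on little endian subtargets the permuted load leaves the loaded
value in the left (big-endian-numbered) half of a vector register that is
8 bytes wider than the value itself. The splat index computed from the
shuffle mask therefore has to be bumped by two elements for word splats and
by one element for doubleword splats before the load offset is computed.
In the testSplat4Low case below, for instance, the index goes from 0 to 2
and the offset works out to (3 - 2) * 4 = 4, matching the addi/lxvwsx
sequence in the new CHECK-P9 lines.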
---
 lib/Target/PowerPC/PPCISelLowering.cpp         | 26 ++++--
 .../PowerPC/canonical-merge-shuffles.ll        | 88 +++++++++++++++++++
 2 files changed, 106 insertions(+), 8 deletions(-)

diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index edc23b2673f..c2ba7195509 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -9126,13 +9126,15 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
                      Op0.getOperand(1));
 }
 
-static const SDValue *getNormalLoadInput(const SDValue &Op) {
+static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {
   const SDValue *InputLoad = &Op;
   if (InputLoad->getOpcode() == ISD::BITCAST)
     InputLoad = &InputLoad->getOperand(0);
   if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
-      InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED)
+      InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {
+    IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;
     InputLoad = &InputLoad->getOperand(0);
+  }
   if (InputLoad->getOpcode() != ISD::LOAD)
     return nullptr;
   LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
@@ -9304,7 +9306,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
 
   if (!BVNIsConstantSplat || SplatBitSize > 32) {
 
-    const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
+    bool IsPermutedLoad = false;
+    const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);
     // Handle load-and-splat patterns as we have instructions that will do this
     // in one go.
     if (InputLoad && DAG.isSplatValue(Op, true)) {
@@ -9927,7 +9930,8 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   // If this is a load-and-splat, we can do that with a single instruction
   // in some cases. However if the load has multiple uses, we don't want to
   // combine it because that will just produce multiple loads.
-  const SDValue *InputLoad = getNormalLoadInput(V1);
+  bool IsPermutedLoad = false;
+  const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad);
   if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
       (PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
       InputLoad->hasOneUse()) {
@@ -9935,6 +9939,16 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     bool IsFourByte = PPC::isSplatShuffleMask(SVOp, 4);
     int SplatIdx =
       PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
+    // The splat index for permuted loads will be in the left half of the vector
+    // which is strictly wider than the loaded value by 8 bytes. So we need to
+    // adjust the splat index to point to the correct address in memory.
+    if (IsPermutedLoad) {
+      assert(isLittleEndian && "Unexpected permuted load on big endian target");
+      SplatIdx += IsFourByte ? 2 : 1;
+      assert((SplatIdx < (IsFourByte ? 4 : 2)) &&
+             "Splat of a value outside of the loaded memory");
+    }
+
     LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
     // For 4-byte load-and-splat, we need Power9.
     if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
@@ -9944,10 +9958,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
         else
           Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
 
-        // If we are loading a partial vector, it does not make sense to adjust
-        // the base pointer. This happens with (splat (s_to_v_permuted (ld))).
-        if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64))
-          Offset = 0;
         SDValue BasePtr = LD->getBasePtr();
         if (Offset != 0)
           BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
diff --git a/test/CodeGen/PowerPC/canonical-merge-shuffles.ll b/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
index 11bc2bae987..cdd04b33318 100644
--- a/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
+++ b/test/CodeGen/PowerPC/canonical-merge-shuffles.ll
@@ -446,5 +446,93 @@ entry:
   ret <16 x i8> %shuffle
 }
 
+define dso_local <4 x i32> @testSplat4Low(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: testSplat4Low:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    ld r3, 0(r3)
+; CHECK-P8-NEXT:    mtfprd f0, r3
+; CHECK-P8-NEXT:    xxspltw v2, vs0, 0
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: testSplat4Low:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    addi r3, r3, 4
+; CHECK-P9-NEXT:    lxvwsx v2, 0, r3
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-NOVSX-LABEL: testSplat4Low:
+; CHECK-NOVSX:       # %bb.0: # %entry
+; CHECK-NOVSX-NEXT:    ld r3, 0(r3)
+; CHECK-NOVSX-NEXT:    addi r4, r1, -16
+; CHECK-NOVSX-NEXT:    std r3, -16(r1)
+; CHECK-NOVSX-NEXT:    lvx v2, 0, r4
+; CHECK-NOVSX-NEXT:    vspltw v2, v2, 2
+; CHECK-NOVSX-NEXT:    blr
+entry:
+  %0 = load <8 x i8>, <8 x i8>* %ptr, align 8
+  %vecinit18 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
+  %1 = bitcast <16 x i8> %vecinit18 to <4 x i32>
+  ret <4 x i32> %1
+}
+
+; Function Attrs: norecurse nounwind readonly
+define dso_local <4 x i32> @testSplat4hi(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: testSplat4hi:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    ld r3, 0(r3)
+; CHECK-P8-NEXT:    mtfprd f0, r3
+; CHECK-P8-NEXT:    xxspltw v2, vs0, 1
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: testSplat4hi:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxvwsx v2, 0, r3
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-NOVSX-LABEL: testSplat4hi:
+; CHECK-NOVSX:       # %bb.0: # %entry
+; CHECK-NOVSX-NEXT:    ld r3, 0(r3)
+; CHECK-NOVSX-NEXT:    addi r4, r1, -16
+; CHECK-NOVSX-NEXT:    std r3, -16(r1)
+; CHECK-NOVSX-NEXT:    lvx v2, 0, r4
+; CHECK-NOVSX-NEXT:    vspltw v2, v2, 3
+; CHECK-NOVSX-NEXT:    blr
+entry:
+  %0 = load <8 x i8>, <8 x i8>* %ptr, align 8
+  %vecinit22 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+  %1 = bitcast <16 x i8> %vecinit22 to <4 x i32>
+  ret <4 x i32> %1
+}
+
+; Function Attrs: norecurse nounwind readonly
+define dso_local <2 x i64> @testSplat8(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
+; CHECK-P8-LABEL: testSplat8:
+; CHECK-P8:       # %bb.0: # %entry
+; CHECK-P8-NEXT:    lxvdsx v2, 0, r3
+; CHECK-P8-NEXT:    blr
+;
+; CHECK-P9-LABEL: testSplat8:
+; CHECK-P9:       # %bb.0: # %entry
+; CHECK-P9-NEXT:    lxvdsx v2, 0, r3
+; CHECK-P9-NEXT:    blr
+;
+; CHECK-NOVSX-LABEL: testSplat8:
+; CHECK-NOVSX:       # %bb.0: # %entry
+; CHECK-NOVSX-NEXT:    ld r3, 0(r3)
+; CHECK-NOVSX-NEXT:    addis r4, r2, .LCPI19_0@toc@ha
+; CHECK-NOVSX-NEXT:    addi r4, r4, .LCPI19_0@toc@l
+; CHECK-NOVSX-NEXT:    lvx v2, 0, r4
+; CHECK-NOVSX-NEXT:    std r3, -16(r1)
+; CHECK-NOVSX-NEXT:    addi r3, r1, -16
+; CHECK-NOVSX-NEXT:    lvx v3, 0, r3
+; CHECK-NOVSX-NEXT:    vperm v2, v3, v3, v2
+; CHECK-NOVSX-NEXT:    blr
+entry:
+  %0 = load <8 x i8>, <8 x i8>* %ptr, align 8
+  %vecinit30 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %1 = bitcast <16 x i8> %vecinit30 to <2 x i64>
+  ret <2 x i64> %1
+}
+
 declare double @dummy() local_unnamed_addr
 attributes #0 = { nounwind }