
[PowerPC] Fix computation of offset for load-and-splat for permuted loads

Unfortunately this is another regression from my canonicalization patch
(1fed131660b2). The patch contained two implicit assumptions:
1. That we would have a permuted load only if we are loading a partial vector
2. That a partial vector load would necessarily be as wide as the splat

However, assumption 2 does not hold since it is possible to do a wider
load and splat only half of it. This patch drops that assumption by simply
checking whether the load is permuted and adjusting the splat index (and
therefore the offset) when it is.
Nemanja Ivanovic 2020-07-24 15:38:27 -04:00
parent 1d7926a981
commit fe8cc25a97
2 changed files with 106 additions and 8 deletions
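
Before the diff, here is a minimal standalone sketch of the offset arithmetic the message above describes, assuming a little-endian subtarget. It is not the in-tree code; the helper name splatByteOffset is hypothetical, and it only mirrors the splat-index adjustment and the little-endian offset formula from the patch.

#include <cassert>
#include <cstdint>

// Byte offset, relative to the load's base pointer, of the element being
// splatted. For a permuted load the splat index points into the left half of
// the 16-byte vector, so mapping it straight to a memory offset would land
// 8 bytes past the data that was actually loaded; the index is therefore
// shifted by 2 words (or 1 doubleword) first.
int64_t splatByteOffset(unsigned SplatIdx, bool IsFourByte, bool IsPermutedLoad) {
  if (IsPermutedLoad) {
    SplatIdx += IsFourByte ? 2 : 1;
    assert(SplatIdx < (IsFourByte ? 4u : 2u) &&
           "Splat of a value outside of the loaded memory");
  }
  // Little-endian element numbering is reversed relative to memory order.
  return IsFourByte ? (3 - (int64_t)SplatIdx) * 4 : (1 - (int64_t)SplatIdx) * 8;
}

In the testSplat4Low test added below, this adjustment is what allows a 4-byte offset to be folded into the base pointer, visible as the addi r3, r3, 4 feeding the lxvwsx in the CHECK-P9 output.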


@@ -9126,13 +9126,15 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
Op0.getOperand(1));
}
static const SDValue *getNormalLoadInput(const SDValue &Op) {
static const SDValue *getNormalLoadInput(const SDValue &Op, bool &IsPermuted) {
const SDValue *InputLoad = &Op;
if (InputLoad->getOpcode() == ISD::BITCAST)
InputLoad = &InputLoad->getOperand(0);
if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED)
InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED) {
IsPermuted = InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED;
InputLoad = &InputLoad->getOperand(0);
}
if (InputLoad->getOpcode() != ISD::LOAD)
return nullptr;
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
@@ -9304,7 +9306,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (!BVNIsConstantSplat || SplatBitSize > 32) {
const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
bool IsPermutedLoad = false;
const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0), IsPermutedLoad);
// Handle load-and-splat patterns as we have instructions that will do this
// in one go.
if (InputLoad && DAG.isSplatValue(Op, true)) {
@@ -9927,7 +9930,8 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
// If this is a load-and-splat, we can do that with a single instruction
// in some cases. However if the load has multiple uses, we don't want to
// combine it because that will just produce multiple loads.
const SDValue *InputLoad = getNormalLoadInput(V1);
bool IsPermutedLoad = false;
const SDValue *InputLoad = getNormalLoadInput(V1, IsPermutedLoad);
if (InputLoad && Subtarget.hasVSX() && V2.isUndef() &&
(PPC::isSplatShuffleMask(SVOp, 4) || PPC::isSplatShuffleMask(SVOp, 8)) &&
InputLoad->hasOneUse()) {
@@ -9935,6 +9939,16 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
int SplatIdx =
PPC::getSplatIdxForPPCMnemonics(SVOp, IsFourByte ? 4 : 8, DAG);
// The splat index for permuted loads will be in the left half of the vector
// which is strictly wider than the loaded value by 8 bytes. So we need to
// adjust the splat index to point to the correct address in memory.
if (IsPermutedLoad) {
assert(isLittleEndian && "Unexpected permuted load on big endian target");
SplatIdx += IsFourByte ? 2 : 1;
assert(SplatIdx < (IsFourByte ? 4 : 2) &&
"Splat of a value outside of the loaded memory");
}
LoadSDNode *LD = cast<LoadSDNode>(*InputLoad);
// For 4-byte load-and-splat, we need Power9.
if ((IsFourByte && Subtarget.hasP9Vector()) || !IsFourByte) {
@@ -9944,10 +9958,6 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
else
Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
// If we are loading a partial vector, it does not make sense to adjust
// the base pointer. This happens with (splat (s_to_v_permuted (ld))).
if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64))
Offset = 0;
SDValue BasePtr = LD->getBasePtr();
if (Offset != 0)
BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),


@@ -446,5 +446,93 @@ entry:
ret <16 x i8> %shuffle
}
define dso_local <4 x i32> @testSplat4Low(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
; CHECK-P8-LABEL: testSplat4Low:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: ld r3, 0(r3)
; CHECK-P8-NEXT: mtfprd f0, r3
; CHECK-P8-NEXT: xxspltw v2, vs0, 0
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: testSplat4Low:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: addi r3, r3, 4
; CHECK-P9-NEXT: lxvwsx v2, 0, r3
; CHECK-P9-NEXT: blr
;
; CHECK-NOVSX-LABEL: testSplat4Low:
; CHECK-NOVSX: # %bb.0: # %entry
; CHECK-NOVSX-NEXT: ld r3, 0(r3)
; CHECK-NOVSX-NEXT: addi r4, r1, -16
; CHECK-NOVSX-NEXT: std r3, -16(r1)
; CHECK-NOVSX-NEXT: lvx v2, 0, r4
; CHECK-NOVSX-NEXT: vspltw v2, v2, 2
; CHECK-NOVSX-NEXT: blr
entry:
%0 = load <8 x i8>, <8 x i8>* %ptr, align 8
%vecinit18 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%1 = bitcast <16 x i8> %vecinit18 to <4 x i32>
ret <4 x i32> %1
}
; Function Attrs: norecurse nounwind readonly
define dso_local <4 x i32> @testSplat4hi(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
; CHECK-P8-LABEL: testSplat4hi:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: ld r3, 0(r3)
; CHECK-P8-NEXT: mtfprd f0, r3
; CHECK-P8-NEXT: xxspltw v2, vs0, 1
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: testSplat4hi:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxvwsx v2, 0, r3
; CHECK-P9-NEXT: blr
;
; CHECK-NOVSX-LABEL: testSplat4hi:
; CHECK-NOVSX: # %bb.0: # %entry
; CHECK-NOVSX-NEXT: ld r3, 0(r3)
; CHECK-NOVSX-NEXT: addi r4, r1, -16
; CHECK-NOVSX-NEXT: std r3, -16(r1)
; CHECK-NOVSX-NEXT: lvx v2, 0, r4
; CHECK-NOVSX-NEXT: vspltw v2, v2, 3
; CHECK-NOVSX-NEXT: blr
entry:
%0 = load <8 x i8>, <8 x i8>* %ptr, align 8
%vecinit22 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%1 = bitcast <16 x i8> %vecinit22 to <4 x i32>
ret <4 x i32> %1
}
; Function Attrs: norecurse nounwind readonly
define dso_local <2 x i64> @testSplat8(<8 x i8>* nocapture readonly %ptr) local_unnamed_addr #0 {
; CHECK-P8-LABEL: testSplat8:
; CHECK-P8: # %bb.0: # %entry
; CHECK-P8-NEXT: lxvdsx v2, 0, r3
; CHECK-P8-NEXT: blr
;
; CHECK-P9-LABEL: testSplat8:
; CHECK-P9: # %bb.0: # %entry
; CHECK-P9-NEXT: lxvdsx v2, 0, r3
; CHECK-P9-NEXT: blr
;
; CHECK-NOVSX-LABEL: testSplat8:
; CHECK-NOVSX: # %bb.0: # %entry
; CHECK-NOVSX-NEXT: ld r3, 0(r3)
; CHECK-NOVSX-NEXT: addis r4, r2, .LCPI19_0@toc@ha
; CHECK-NOVSX-NEXT: addi r4, r4, .LCPI19_0@toc@l
; CHECK-NOVSX-NEXT: lvx v2, 0, r4
; CHECK-NOVSX-NEXT: std r3, -16(r1)
; CHECK-NOVSX-NEXT: addi r3, r1, -16
; CHECK-NOVSX-NEXT: lvx v3, 0, r3
; CHECK-NOVSX-NEXT: vperm v2, v3, v3, v2
; CHECK-NOVSX-NEXT: blr
entry:
%0 = load <8 x i8>, <8 x i8>* %ptr, align 8
%vecinit30 = shufflevector <8 x i8> %0, <8 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%1 = bitcast <16 x i8> %vecinit30 to <2 x i64>
ret <2 x i64> %1
}
declare double @dummy() local_unnamed_addr
attributes #0 = { nounwind }