mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-19 11:02:59 +02:00
[X86] In combineLoopMAddPattern and combineLoopSADPattern, preserve the vector reduction flag on the final add. Handle unrolled loops by letting DAG combine revisit.
This reverts r340478 and r340631 and replaces them with a simpler method of just letting DAG combine revisit the nodes to handle the other operand. llvm-svn: 367195
This commit is contained in:
parent
81938dd934
commit
ea59641915
@ -43151,8 +43151,17 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
|
||||
if (!Subtarget.hasSSE2())
|
||||
return SDValue();
|
||||
|
||||
SDValue Op0 = N->getOperand(0);
|
||||
SDValue Op1 = N->getOperand(1);
|
||||
SDValue MulOp = N->getOperand(0);
|
||||
SDValue OtherOp = N->getOperand(1);
|
||||
|
||||
if (MulOp.getOpcode() != ISD::MUL)
|
||||
std::swap(MulOp, OtherOp);
|
||||
if (MulOp.getOpcode() != ISD::MUL)
|
||||
return SDValue();
|
||||
|
||||
ShrinkMode Mode;
|
||||
if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
|
||||
return SDValue();
|
||||
|
||||
EVT VT = N->getValueType(0);
|
||||
|
||||
@ -43161,49 +43170,33 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
|
||||
if (!VT.isVector() || VT.getVectorNumElements() < 8)
|
||||
return SDValue();
|
||||
|
||||
if (Op0.getOpcode() != ISD::MUL)
|
||||
std::swap(Op0, Op1);
|
||||
if (Op0.getOpcode() != ISD::MUL)
|
||||
return SDValue();
|
||||
|
||||
ShrinkMode Mode;
|
||||
if (!canReduceVMulWidth(Op0.getNode(), DAG, Mode) || Mode == MULU16)
|
||||
return SDValue();
|
||||
|
||||
SDLoc DL(N);
|
||||
EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
|
||||
VT.getVectorNumElements());
|
||||
EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
|
||||
VT.getVectorNumElements() / 2);
|
||||
|
||||
// Shrink the operands of mul.
|
||||
SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
|
||||
SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
|
||||
|
||||
// Madd vector size is half of the original vector size
|
||||
auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
|
||||
ArrayRef<SDValue> Ops) {
|
||||
MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
|
||||
return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
|
||||
};
|
||||
SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
|
||||
PMADDWDBuilder);
|
||||
// Fill the rest of the output with 0
|
||||
SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
|
||||
SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
|
||||
|
||||
auto BuildPMADDWD = [&](SDValue Mul) {
|
||||
// Shrink the operands of mul.
|
||||
SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(0));
|
||||
SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, Mul.getOperand(1));
|
||||
|
||||
SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
|
||||
PMADDWDBuilder);
|
||||
// Fill the rest of the output with 0
|
||||
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd,
|
||||
DAG.getConstant(0, DL, MAddVT));
|
||||
};
|
||||
|
||||
Op0 = BuildPMADDWD(Op0);
|
||||
|
||||
// It's possible that Op1 is also a mul we can reduce.
|
||||
if (Op1.getOpcode() == ISD::MUL &&
|
||||
canReduceVMulWidth(Op1.getNode(), DAG, Mode) && Mode != MULU16) {
|
||||
Op1 = BuildPMADDWD(Op1);
|
||||
}
|
||||
|
||||
return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
|
||||
// Preserve the reduction flag on the ADD. We may need to revisit for the
|
||||
// other operand.
|
||||
SDNodeFlags Flags;
|
||||
Flags.setVectorReduction(true);
|
||||
return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
|
||||
}
|
||||
|
||||
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
|
||||
@ -43213,8 +43206,6 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
|
||||
|
||||
SDLoc DL(N);
|
||||
EVT VT = N->getValueType(0);
|
||||
SDValue Op0 = N->getOperand(0);
|
||||
SDValue Op1 = N->getOperand(1);
|
||||
|
||||
// TODO: There's nothing special about i32, any integer type above i16 should
|
||||
// work just as well.
|
||||
@ -43234,55 +43225,49 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
|
||||
if (VT.getSizeInBits() / 4 > RegSize)
|
||||
return SDValue();
|
||||
|
||||
// We know N is a reduction add, which means one of its operands is a phi.
|
||||
// To match SAD, we need the other operand to be a ABS.
|
||||
if (Op0.getOpcode() != ISD::ABS)
|
||||
std::swap(Op0, Op1);
|
||||
if (Op0.getOpcode() != ISD::ABS)
|
||||
// We know N is a reduction add. To match SAD, we need one of the operands to
|
||||
// be an ABS.
|
||||
SDValue AbsOp = N->getOperand(0);
|
||||
SDValue OtherOp = N->getOperand(1);
|
||||
if (AbsOp.getOpcode() != ISD::ABS)
|
||||
std::swap(AbsOp, OtherOp);
|
||||
if (AbsOp.getOpcode() != ISD::ABS)
|
||||
return SDValue();
|
||||
|
||||
auto BuildPSADBW = [&](SDValue Op0, SDValue Op1) {
|
||||
// SAD pattern detected. Now build a SAD instruction and an addition for
|
||||
// reduction. Note that the number of elements of the result of SAD is less
|
||||
// than the number of elements of its input. Therefore, we could only update
|
||||
// part of elements in the reduction vector.
|
||||
SDValue Sad = createPSADBW(DAG, Op0, Op1, DL, Subtarget);
|
||||
|
||||
// The output of PSADBW is a vector of i64.
|
||||
// We need to turn the vector of i64 into a vector of i32.
|
||||
// If the reduction vector is at least as wide as the psadbw result, just
|
||||
// bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
|
||||
// anyway.
|
||||
MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
|
||||
if (VT.getSizeInBits() >= ResVT.getSizeInBits())
|
||||
Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
|
||||
else
|
||||
Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
|
||||
|
||||
if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
|
||||
// Fill the upper elements with zero to match the add width.
|
||||
SDValue Zero = DAG.getConstant(0, DL, VT);
|
||||
Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
|
||||
DAG.getIntPtrConstant(0, DL));
|
||||
}
|
||||
|
||||
return Sad;
|
||||
};
|
||||
|
||||
// Check whether we have an abs-diff pattern feeding into the select.
|
||||
SDValue SadOp0, SadOp1;
|
||||
if (!detectZextAbsDiff(Op0, SadOp0, SadOp1))
|
||||
if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
|
||||
return SDValue();
|
||||
|
||||
Op0 = BuildPSADBW(SadOp0, SadOp1);
|
||||
// SAD pattern detected. Now build a SAD instruction and an addition for
|
||||
// reduction. Note that the number of elements of the result of SAD is less
|
||||
// than the number of elements of its input. Therefore, we could only update
|
||||
// part of elements in the reduction vector.
|
||||
SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
|
||||
|
||||
// It's possible we have a sad on the other side too.
|
||||
if (Op1.getOpcode() == ISD::ABS &&
|
||||
detectZextAbsDiff(Op1, SadOp0, SadOp1)) {
|
||||
Op1 = BuildPSADBW(SadOp0, SadOp1);
|
||||
// The output of PSADBW is a vector of i64.
|
||||
// We need to turn the vector of i64 into a vector of i32.
|
||||
// If the reduction vector is at least as wide as the psadbw result, just
|
||||
// bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
|
||||
// anyway.
|
||||
MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
|
||||
if (VT.getSizeInBits() >= ResVT.getSizeInBits())
|
||||
Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
|
||||
else
|
||||
Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
|
||||
|
||||
if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
|
||||
// Fill the upper elements with zero to match the add width.
|
||||
SDValue Zero = DAG.getConstant(0, DL, VT);
|
||||
Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
|
||||
DAG.getIntPtrConstant(0, DL));
|
||||
}
|
||||
|
||||
return DAG.getNode(ISD::ADD, DL, VT, Op0, Op1);
|
||||
// Preserve the reduction flag on the ADD. We may need to revisit for the
|
||||
// other operand.
|
||||
SDNodeFlags Flags;
|
||||
Flags.setVectorReduction(true);
|
||||
return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
|
||||
}
|
||||
|
||||
/// Convert vector increment or decrement to sub/add with an all-ones constant:
|
||||
|
@ -2677,9 +2677,9 @@ define i32 @madd_double_reduction(<8 x i16>* %arg, <8 x i16>* %arg1, <8 x i16>*
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vmovdqu (%rdi), %xmm0
|
||||
; AVX-NEXT: vmovdqu (%rdx), %xmm1
|
||||
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
|
||||
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
|
||||
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
||||
|
@ -1403,18 +1403,18 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
|
||||
; SSE2-NEXT: movdqu (%rdi), %xmm0
|
||||
; SSE2-NEXT: movdqu (%rsi), %xmm1
|
||||
; SSE2-NEXT: psadbw %xmm0, %xmm1
|
||||
; SSE2-NEXT: movdqu (%rdx), %xmm0
|
||||
; SSE2-NEXT: movdqu (%rcx), %xmm2
|
||||
; SSE2-NEXT: psadbw %xmm0, %xmm2
|
||||
; SSE2-NEXT: movl $1, %eax
|
||||
; SSE2-NEXT: movd %eax, %xmm0
|
||||
; SSE2-NEXT: paddd %xmm2, %xmm0
|
||||
; SSE2-NEXT: paddd %xmm1, %xmm0
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
|
||||
; SSE2-NEXT: movdqu (%rdx), %xmm2
|
||||
; SSE2-NEXT: movdqu (%rcx), %xmm3
|
||||
; SSE2-NEXT: psadbw %xmm2, %xmm3
|
||||
; SSE2-NEXT: paddd %xmm0, %xmm3
|
||||
; SSE2-NEXT: paddd %xmm1, %xmm3
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
|
||||
; SSE2-NEXT: paddd %xmm3, %xmm0
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
||||
; SSE2-NEXT: paddd %xmm0, %xmm1
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
|
||||
; SSE2-NEXT: paddd %xmm1, %xmm0
|
||||
; SSE2-NEXT: movd %xmm0, %eax
|
||||
; SSE2-NEXT: movd %xmm1, %eax
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: sad_unroll_nonzero_initial:
|
||||
@ -1442,7 +1442,7 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
|
||||
; AVX2-NEXT: vmovd %eax, %xmm1
|
||||
; AVX2-NEXT: vmovdqu (%rdx), %xmm2
|
||||
; AVX2-NEXT: vpsadbw (%rcx), %xmm2, %xmm2
|
||||
; AVX2-NEXT: vpaddd %xmm1, %xmm2, %xmm1
|
||||
; AVX2-NEXT: vpaddd %xmm2, %xmm1, %xmm1
|
||||
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
|
||||
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
@ -1455,10 +1455,10 @@ define i32 @sad_unroll_nonzero_initial(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x
|
||||
; AVX512: # %bb.0: # %bb
|
||||
; AVX512-NEXT: vmovdqu (%rdi), %xmm0
|
||||
; AVX512-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
|
||||
; AVX512-NEXT: vmovdqu (%rdx), %xmm1
|
||||
; AVX512-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
|
||||
; AVX512-NEXT: movl $1, %eax
|
||||
; AVX512-NEXT: vmovd %eax, %xmm2
|
||||
; AVX512-NEXT: vmovd %eax, %xmm1
|
||||
; AVX512-NEXT: vmovdqu (%rdx), %xmm2
|
||||
; AVX512-NEXT: vpsadbw (%rcx), %xmm2, %xmm2
|
||||
; AVX512-NEXT: vpaddd %zmm2, %zmm1, %zmm1
|
||||
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
|
||||
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
|
||||
@ -1526,9 +1526,9 @@ define i32 @sad_double_reduction(<16 x i8>* %arg, <16 x i8>* %arg1, <16 x i8>* %
|
||||
; AVX: # %bb.0: # %bb
|
||||
; AVX-NEXT: vmovdqu (%rdi), %xmm0
|
||||
; AVX-NEXT: vmovdqu (%rdx), %xmm1
|
||||
; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpsadbw (%rcx), %xmm1, %xmm1
|
||||
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX-NEXT: vpsadbw (%rsi), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
|
||||
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
||||
|
Loading…
Reference in New Issue
Block a user