mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 03:02:36 +01:00
[x86] avoid 256-bit andnp that requires insert/extract with AVX1 (PR37449)
This is the final (I hope!) problem pattern mentioned in PR37749:
https://bugs.llvm.org/show_bug.cgi?id=37749

We are trying to avoid an AVX1 sinkhole caused by having 256-bit bitwise logic ops but no other 256-bit integer ops. We've already solved the simple logic ops, but 'andn' is an x86 special. I looked at alternative solutions like extending the generic DAG combine or trying to wait until the ANDNP node is created, but those are bigger patches that can over-reach. Ie, splitting to 128-bit does not look like a win in most cases with >1 256-bit op.

The pattern matching is cluttered with bitcasts because of our i64 element canonicalization. For the affected test, we have this vector-type-legalized sequence:

    t29: v8i32 = concat_vectors t27, t28
    t30: v4i64 = bitcast t29
    t18: v8i32 = BUILD_VECTOR Constant:i32<-1>, Constant:i32<-1>, ...
    t31: v4i64 = bitcast t18
    t32: v4i64 = xor t30, t31
    t9: v8i32 = BUILD_VECTOR Constant:i32<255>, Constant:i32<255>, ...
    t34: v4i64 = bitcast t9
    t35: v4i64 = and t32, t34
    t36: v8i32 = bitcast t35
    t37: v4i32 = extract_subvector t36, Constant:i64<0>
    t38: v4i32 = extract_subvector t36, Constant:i64<4>

Differential Revision: https://reviews.llvm.org/D52318

llvm-svn: 343008
This commit is contained in:
parent
da8773b8d1
commit
50d6ec057c
@ -8205,7 +8205,7 @@ SDValue llvm::peekThroughOneUseBitcasts(SDValue V) {
|
||||
/// Return true if \p V is a bitwise 'not': an XOR whose second operand is an
/// all-ones constant (scalar or splat).
///
/// The constant operand is matched through bitcasts so that vectors whose
/// element type was canonicalized to a different width (e.g. an i32 all-ones
/// splat viewed as v4i64 after x86's i64 canonicalization) are still
/// recognized. This is the post-commit form of the function; the diff's
/// removed line (no bitcast peek) is resolved away here.
bool llvm::isBitwiseNot(SDValue V) {
  if (V.getOpcode() != ISD::XOR)
    return false;
  // Peek through bitcasts: type legalization may hide the all-ones splat
  // behind a cast to the canonical element type.
  ConstantSDNode *C = isConstOrConstSplat(peekThroughBitcasts(V.getOperand(1)));
  return C && C->isAllOnesValue();
}
|
||||
|
||||
|
@ -40168,6 +40168,37 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
|
||||
static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const X86Subtarget &Subtarget) {
|
||||
// For AVX1 only, if we are extracting from a 256-bit and+not (which will
|
||||
// eventually get combined/lowered into ANDNP) with a concatenated operand,
|
||||
// split the 'and' into 128-bit ops to avoid the concatenate and extract.
|
||||
// We let generic combining take over from there to simplify the
|
||||
// insert/extract and 'not'.
|
||||
// This pattern emerges during AVX1 legalization. We handle it before lowering
|
||||
// to avoid complications like splitting constant vector loads.
|
||||
|
||||
// Capture the original wide type in the likely case that we need to bitcast
|
||||
// back to this type.
|
||||
EVT VT = N->getValueType(0);
|
||||
EVT WideVecVT = N->getOperand(0).getValueType();
|
||||
SDValue WideVec = peekThroughBitcasts(N->getOperand(0));
|
||||
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && WideVecVT.isSimple() &&
|
||||
WideVecVT.getSizeInBits() == 256 && WideVec.getOpcode() == ISD::AND) {
|
||||
auto isConcatenatedNot = [] (SDValue V) {
|
||||
V = peekThroughBitcasts(V);
|
||||
if (!isBitwiseNot(V))
|
||||
return false;
|
||||
SDValue NotOp = V->getOperand(0);
|
||||
return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS;
|
||||
};
|
||||
if (isConcatenatedNot(WideVec.getOperand(0)) ||
|
||||
isConcatenatedNot(WideVec.getOperand(1))) {
|
||||
// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
|
||||
SDValue Concat = split256IntArith(WideVec, DAG);
|
||||
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
|
||||
DAG.getBitcast(WideVecVT, Concat), N->getOperand(1));
|
||||
}
|
||||
}
|
||||
|
||||
if (DCI.isBeforeLegalizeOps())
|
||||
return SDValue();
|
||||
|
||||
|
@ -342,9 +342,9 @@ define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
|
||||
; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm3
|
||||
; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vandnps {{.*}}(%rip), %ymm0, %ymm0
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [1095216660735,1095216660735]
|
||||
; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpandn %xmm1, %xmm3, %xmm1
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
|
||||
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
|
||||
@ -364,6 +364,8 @@ define <8 x i32> @andn_disguised_i8_elts(<8 x i32> %x, <8 x i32> %y, <8 x i32> %
|
||||
ret <8 x i32> %add1
|
||||
}
|
||||
|
||||
; Negative test - if we don't have a leading concat_vectors, the transform won't be profitable.
|
||||
|
||||
define <8 x i32> @andn_variable_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
|
||||
; AVX1-LABEL: andn_variable_mask_operand_no_concat:
|
||||
; AVX1: # %bb.0:
|
||||
@ -386,6 +388,8 @@ define <8 x i32> @andn_variable_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %
|
||||
ret <8 x i32> %add
|
||||
}
|
||||
|
||||
; Negative test - if we don't have a leading concat_vectors, the transform won't be profitable (even if the mask is a constant).
|
||||
|
||||
define <8 x i32> @andn_constant_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %y) {
|
||||
; AVX1-LABEL: andn_constant_mask_operand_no_concat:
|
||||
; AVX1: # %bb.0:
|
||||
@ -408,6 +412,8 @@ define <8 x i32> @andn_constant_mask_operand_no_concat(<8 x i32> %x, <8 x i32> %
|
||||
ret <8 x i32> %r
|
||||
}
|
||||
|
||||
; This is a close call, but we split the 'andn' to reduce the insert/extract.
|
||||
|
||||
define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z, <8 x i32> %w) {
|
||||
; AVX1-LABEL: andn_variable_mask_operand_concat:
|
||||
; AVX1: # %bb.0:
|
||||
@ -415,9 +421,9 @@ define <8 x i32> @andn_variable_mask_operand_concat(<8 x i32> %x, <8 x i32> %y,
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
|
||||
; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
|
||||
; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vandnps %ymm2, %ymm0, %ymm0
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
|
||||
; AVX1-NEXT: vpandn %xmm2, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
|
||||
; AVX1-NEXT: vpandn %xmm1, %xmm4, %xmm1
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
|
||||
; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpaddd %xmm3, %xmm0, %xmm0
|
||||
|
Loading…
Reference in New Issue
Block a user