
[X86] Stop reduceMaskedLoadToScalarLoad/reduceMaskedStoreToScalarStore from creating scalar i64 load/stores in 32-bit mode

If we emit a scalar i64 load/store in 32-bit mode, it will get type legalized to two i32 loads/stores.

Differential Revision: https://reviews.llvm.org/D87862
Craig Topper 2020-09-20 13:13:42 -07:00
parent b826feba37
commit aa3ccaf198
2 changed files with 34 additions and 15 deletions
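
For illustration only, here is a minimal standalone C++ sketch of the decision this patch adds; it deliberately avoids the LLVM APIs, and the enum and helper names below are made up for the example. The idea it models: when the vector element type is i64 but the target is not 64-bit, the one-element scalar access is re-typed as f64 (with the vector value bitcast around the operation), so the memory access stays a single 64-bit instruction such as vmovlps/vmovhps instead of being split into two 32-bit accesses.

// Standalone C++ sketch, not LLVM code: the enum and helper below are
// hypothetical and only model the element-retyping rule from this patch.
#include <iostream>

enum class ScalarTy { I32, I64, F32, F64 };

// On a 32-bit target a scalar i64 load/store would be type legalized into
// two i32 accesses, so use an f64 scalar access (and bitcast the vector)
// instead; otherwise keep the original element type.
ScalarTy chooseScalarEltForOneElementAccess(ScalarTy EltTy, bool TargetIs64Bit) {
  if (EltTy == ScalarTy::I64 && !TargetIs64Bit)
    return ScalarTy::F64;
  return EltTy;
}

int main() {
  auto Name = [](ScalarTy T) {
    switch (T) {
    case ScalarTy::I32: return "i32";
    case ScalarTy::I64: return "i64";
    case ScalarTy::F32: return "f32";
    case ScalarTy::F64: return "f64";
    }
    return "?";
  };
  // i64 element on a 32-bit target: retyped to f64 so one 64-bit SSE
  // load/store can be used.
  std::cout << Name(chooseScalarEltForOneElementAccess(ScalarTy::I64, false))
            << "\n"; // prints "f64"
  // i64 element on a 64-bit target: a plain scalar i64 access is fine.
  std::cout << Name(chooseScalarEltForOneElementAccess(ScalarTy::I64, true))
            << "\n"; // prints "i64"
}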


@@ -44499,7 +44499,8 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
 /// mask have already been optimized in IR, so we don't bother with those here.
 static SDValue
 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
-                             TargetLowering::DAGCombinerInfo &DCI) {
+                             TargetLowering::DAGCombinerInfo &DCI,
+                             const X86Subtarget &Subtarget) {
   assert(ML->isUnindexed() && "Unexpected indexed masked load!");
   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
   // However, some target hooks may need to be added to know when the transform
@@ -44516,14 +44517,25 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
   SDLoc DL(ML);
   EVT VT = ML->getValueType(0);
   EVT EltVT = VT.getVectorElementType();
+  EVT CastVT = VT;
+  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+    EltVT = MVT::f64;
+    CastVT =
+        EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+  }
   SDValue Load =
       DAG.getLoad(EltVT, DL, ML->getChain(), Addr,
                   ML->getPointerInfo().getWithOffset(Offset),
                   Alignment, ML->getMemOperand()->getFlags());
+  SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru());
   // Insert the loaded element into the appropriate place in the vector.
-  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
-                               ML->getPassThru(), Load, VecIndex);
+  SDValue Insert =
+      DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex);
+  Insert = DAG.getBitcast(VT, Insert);
   return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
 }
@@ -44586,7 +44598,8 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
     return SDValue();
   if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
-    if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
+    if (SDValue ScalarLoad =
+            reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget))
       return ScalarLoad;
     // TODO: Do some AVX512 subsets benefit from this transform?
@@ -44623,7 +44636,8 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
 /// mask have already been optimized in IR, so we don't bother with those here.
 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
-                                              SelectionDAG &DAG) {
+                                              SelectionDAG &DAG,
+                                              const X86Subtarget &Subtarget) {
   // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
   // However, some target hooks may need to be added to know when the transform
   // is profitable. Endianness would also have to be considered.
@@ -44636,10 +44650,17 @@ static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
   // Extract the one scalar element that is actually being stored.
   SDLoc DL(MS);
-  EVT VT = MS->getValue().getValueType();
+  SDValue Value = MS->getValue();
+  EVT VT = Value.getValueType();
   EVT EltVT = VT.getVectorElementType();
-  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
-                                MS->getValue(), VecIndex);
+  if (EltVT == MVT::i64 && !Subtarget.is64Bit()) {
+    EltVT = MVT::f64;
+    EVT CastVT =
+        EVT::getVectorVT(*DAG.getContext(), EltVT, VT.getVectorNumElements());
+    Value = DAG.getBitcast(CastVT, Value);
+  }
+  SDValue Extract =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex);
   // Store that element at the appropriate offset from the base pointer.
   return DAG.getStore(MS->getChain(), DL, Extract, Addr,
@@ -44661,7 +44682,7 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
   if (Mst->isTruncatingStore())
     return SDValue();
-  if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
+  if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget))
     return ScalarStore;
   // If the mask value has been legalized to a non-boolean vector, try to


@@ -6504,8 +6504,7 @@ define <2 x i64> @mload_constmask_v2i64(<2 x i64>* %addr, <2 x i64> %dst) {
 ; X86-AVX512-LABEL: mload_constmask_v2i64:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    vpinsrd $2, 8(%eax), %xmm0, %xmm0
-; X86-AVX512-NEXT:    vpinsrd $3, 12(%eax), %xmm0, %xmm0
+; X86-AVX512-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
 ; X86-AVX512-NEXT:    retl
   %res = call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> <i1 0, i1 1>, <2 x i64> %dst)
   ret <2 x i64> %res
@@ -7109,10 +7108,9 @@ define <4 x i64> @load_one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
 ; X86-AVX512-LABEL: load_one_mask_bit_set3:
 ; X86-AVX512:       ## %bb.0:
 ; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; X86-AVX512-NEXT:    vpinsrd $0, 16(%eax), %xmm1, %xmm1
-; X86-AVX512-NEXT:    vpinsrd $1, 20(%eax), %xmm1, %xmm1
-; X86-AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; X86-AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-AVX512-NEXT:    vmovlps {{.*#+}} xmm1 = mem[0,1],xmm1[2,3]
+; X86-AVX512-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X86-AVX512-NEXT:    retl
   %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x i64> %val)
   ret <4 x i64> %res