
[X86] Separate the memory size of vzext_load/vextract_store from the element size of the result type. Use them to improve the codegen of v2f32 loads/stores with sse1 only.

Summary:
SSE1 only supports v4f32, but it does have instructions like movlps/movhps that load/store 64 bits of memory.
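As a rough illustration (a hypothetical snippet, not one of the tests touched by this commit; the function name and llc flags are assumptions), a <2 x float> load compiled with SSE1 but without SSE2 can now be selected as a single movlps into a zeroed register instead of a pair of scalar 32-bit loads:

; Hypothetical example; build with something like:
;   llc -mtriple=x86_64-- -mattr=+sse,-sse2
define <2 x float> @load_v2f32(<2 x float>* %p) {
  ; The v2f32 load is widened and, after this patch, emitted as an
  ; X86ISD::VZEXT_LOAD with a 64-bit memory VT, which selects to movlps.
  %v = load <2 x float>, <2 x float>* %p
  ret <2 x float> %v
}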

This patch breaks the connection between the node VT of the vzext_load/vextract_store patterns and the memory VT, enabling a v4f32 node with a 64-bit memory VT. I've used i64 as the memory VT here. I've written the PatFrag predicate to check just the store size, not the specific VT; I think the VT will only matter for CSE purposes. We could use v2f32, but if we want to start using these operations in more places, a simple integer type might make the most sense.
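For the store side, a minimal sketch under the same assumptions (hypothetical function name and flags): the v2f32 store is lowered to an X86ISD::VEXTRACT_STORE whose memory VT is MVT::i64 even though the stored value is v4f32, and the new X86vextractstore64 pattern selects it to a single movlps store:

; Hypothetical example; build with something like:
;   llc -mtriple=x86_64-- -mattr=+sse,-sse2
define void @store_v2f32(<2 x float> %v, <2 x float>* %p) {
  ; Only the low 64 bits of the widened v4f32 value are written to memory.
  store <2 x float> %v, <2 x float>* %p
  ret void
}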

I'd like to maybe use this same approach for SSE2 and later as well, but that will need more work in EltsFromConsecutiveLoads to avoid regressing lit tests. I'd also like to combine bitcasts with these load/store nodes now that the types are disconnected, and to consider canonicalizing (scalar_to_vector + load) to vzext_load.

If you want, I can split the mechanical tablegen changes where I added the 32/64 suffixes off from the sse1 change.

Reviewers: spatel, RKSimon

Reviewed By: RKSimon

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D64528

llvm-svn: 366034
Craig Topper 2019-07-15 02:02:31 +00:00
parent f5ffd2a90c
commit d08e56b62d
6 changed files with 178 additions and 197 deletions


@ -794,6 +794,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@ -971,11 +974,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
// store.
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v4i16, Custom);
setOperationAction(ISD::LOAD, MVT::v8i8, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2i32, Custom);
setOperationAction(ISD::STORE, MVT::v4i16, Custom);
setOperationAction(ISD::STORE, MVT::v8i8, Custom);
@ -21267,21 +21268,29 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
TargetLowering::TypeWidenVector)
return SDValue();
// Widen the vector, cast to a v2x64 type, extract the single 64-bit element
// and store it.
MVT WideVT = MVT::getVectorVT(StoreVT.getVectorElementType(),
StoreVT.getVectorNumElements() * 2);
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal,
DAG.getUNDEF(StoreVT));
MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
MVT CastVT = MVT::getVectorVT(StVT, 2);
StoredVal = DAG.getBitcast(CastVT, StoredVal);
StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
DAG.getIntPtrConstant(0, dl));
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
if (Subtarget.hasSSE2()) {
// Widen the vector, cast to a v2x64 type, extract the single 64-bit element
// and store it.
MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64;
MVT CastVT = MVT::getVectorVT(StVT, 2);
StoredVal = DAG.getBitcast(CastVT, StoredVal);
StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal,
DAG.getIntPtrConstant(0, dl));
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
St->getPointerInfo(), St->getAlignment(),
St->getMemOperand()->getFlags());
}
assert(Subtarget.hasSSE1() && "Expected SSE");
SDVTList Tys = DAG.getVTList(MVT::Other);
SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()};
return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64,
St->getMemOperand());
}
// Lower vector extended loads using a shuffle. If SSSE3 is not available we
@ -28155,19 +28164,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (!ISD::isNON_EXTLoad(N))
return;
auto *Ld = cast<LoadSDNode>(N);
MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(),
Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
SDValue Chain = Res.getValue(1);
MVT WideVT = MVT::getVectorVT(LdVT, 2);
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() * 2);
Res = DAG.getBitcast(CastVT, Res);
if (Subtarget.hasSSE2()) {
MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
Ld->getPointerInfo(), Ld->getAlignment(),
Ld->getMemOperand()->getFlags());
SDValue Chain = Res.getValue(1);
MVT WideVT = MVT::getVectorVT(LdVT, 2);
Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, WideVT, Res);
MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(),
VT.getVectorNumElements() * 2);
Res = DAG.getBitcast(CastVT, Res);
Results.push_back(Res);
Results.push_back(Chain);
return;
}
assert(Subtarget.hasSSE1() && "Expected SSE");
SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other);
SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()};
SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Ld->getMemOperand());
Results.push_back(Res);
Results.push_back(Chain);
Results.push_back(Res.getValue(1));
return;
}
}
@ -32016,8 +32034,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// directly if we don't shuffle the lower element and we shuffle the upper
// (zero) elements within themselves.
if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
(V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
(cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
MaskEltSizeInBits) == 0) {
unsigned Scale =
cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
MaskEltSizeInBits;
ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {


@ -1352,15 +1352,15 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX512] in {
// 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZm addr:$src)>;
}
let Predicates = [HasVLX] in {
// 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZ128m addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQZ256m addr:$src)>;
}
let Predicates = [HasVLX, HasBWI] in {
@ -3838,7 +3838,7 @@ def : InstAlias<"vmovq.s\t{$src, $dst|$dst, $src}",
(VMOVPQI2QIZrr VR128X:$dst, VR128X:$src), 0>;
let Predicates = [HasAVX512] in {
def : Pat<(X86vextractstore (v2i64 VR128X:$src), addr:$dst),
def : Pat<(X86vextractstore64 (v2i64 VR128X:$src), addr:$dst),
(VMOVPQI2QIZmr addr:$dst, VR128X:$src)>;
}
@ -3873,7 +3873,7 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
multiclass avx512_move_scalar<string asm, SDNode OpNode,
multiclass avx512_move_scalar<string asm, SDNode OpNode, PatFrag vzload_frag,
X86VectorVTInfo _> {
let Predicates = [HasAVX512, OptForSize] in
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
@ -3901,7 +3901,7 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode,
let canFoldAsLoad = 1, isReMaterializable = 1 in {
def rm : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set _.RC:$dst, (_.VT (X86vzload addr:$src)))],
[(set _.RC:$dst, (_.VT (vzload_frag addr:$src)))],
_.ExeDomain>, EVEX, Sched<[WriteFLoad]>;
// _alt version uses FR32/FR64 register class.
let isCodeGenOnly = 1 in
@ -3935,10 +3935,10 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode,
NotMemoryFoldable;
}
defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, X86vzload32, f32x_info>,
VEX_LIG, XS, EVEX_CD8<32, CD8VT1>;
defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, f64x_info>,
defm VMOVSDZ : avx512_move_scalar<"vmovsd", X86Movsd, X86vzload64, f64x_info>,
VEX_LIG, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
@ -4319,16 +4319,16 @@ let Predicates = [HasAVX512] in {
// Represent the same patterns above but in the form they appear for
// 256-bit types
def : Pat<(v8f32 (X86vzload addr:$src)),
def : Pat<(v8f32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzload addr:$src)),
def : Pat<(v4f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
// Represent the same patterns above but in the form they appear for
// 512-bit types
def : Pat<(v16f32 (X86vzload addr:$src)),
def : Pat<(v16f32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSZrm addr:$src), sub_xmm)>;
def : Pat<(v8f64 (X86vzload addr:$src)),
def : Pat<(v8f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
}
@ -4351,21 +4351,21 @@ let Predicates = [HasAVX512] in {
// AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzload addr:$src)),
def : Pat<(v4i32 (X86vzload32 addr:$src)),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v8i32 (X86vzload addr:$src)),
def : Pat<(v8i32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 VR128X:$src))),
(VMOVZPQILo2PQIZrr VR128X:$src)>;
def : Pat<(v2i64 (X86vzload addr:$src)),
def : Pat<(v2i64 (X86vzload64 addr:$src)),
(VMOVQI2PQIZrm addr:$src)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
def : Pat<(v4i64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
// Use regular 128-bit instructions to match 512-bit scalar_to_vec+zext.
def : Pat<(v16i32 (X86vzload addr:$src)),
def : Pat<(v16i32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (v4i32 (VMOVDI2PDIZrm addr:$src)), sub_xmm)>;
def : Pat<(v8i64 (X86vzload addr:$src)),
def : Pat<(v8i64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIZrm addr:$src)), sub_xmm)>;
def : Pat<(v4f64 (X86vzmovl (v4f64 VR256X:$src))),
@ -6353,11 +6353,11 @@ let Predicates = [HasAVX512] in {
def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload addr:$src2))),
def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload64 addr:$src2))),
(VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
// VMOVLPD patterns
def : Pat<(v2f64 (X86Movsd VR128X:$src1, (X86vzload addr:$src2))),
def : Pat<(v2f64 (X86Movsd VR128X:$src1, (X86vzload64 addr:$src2))),
(VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
}
@ -8292,47 +8292,47 @@ let Predicates = [HasVLX] in {
}
let Predicates = [HasDQI, HasVLX] in {
def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTPS2QQZ128rm addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
(X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
(X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
(X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
(X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
v2i64x_info.ImmAllZerosV)),
(VCVTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTPS2UQQZ128rm addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
(X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
(X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
(X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
(X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
v2i64x_info.ImmAllZerosV)),
(VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
def : Pat<(v2i64 (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTTPS2QQZ128rm addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
(X86cvttp2si (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
(X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
(X86cvttp2si (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
(X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
v2i64x_info.ImmAllZerosV)),
(VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload addr:$src))))),
def : Pat<(v2i64 (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTTPS2UQQZ128rm addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
(X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
(X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
(X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload addr:$src)))),
(X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
v2i64x_info.ImmAllZerosV)),
(VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
}
@ -8375,25 +8375,25 @@ def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
}
let Predicates = [HasVLX] in {
def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTDQ2PDZ128rm addr:$src)>;
def : Pat<(v2f64 (vselect VK2WM:$mask,
(X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
(X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2f64 (vselect VK2WM:$mask,
(X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
(X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
v2f64x_info.ImmAllZerosV)),
(VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTUDQ2PDZ128rm addr:$src)>;
def : Pat<(v2f64 (vselect VK2WM:$mask,
(X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
(X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2f64 (vselect VK2WM:$mask,
(X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src)))),
(X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
v2f64x_info.ImmAllZerosV)),
(VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
}
@ -8562,7 +8562,7 @@ let Predicates = [HasVLX] in {
EVEX_CD8<32, CD8VH>;
// Pattern match vcvtph2ps of a scalar i64 load.
def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(VCVTPH2PSZ128rm addr:$src)>;
def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
(v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
@ -9626,13 +9626,13 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v4i32 addr:$src)))),
def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (extloadi32i16 addr:$src)))))),
@ -9642,35 +9642,35 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v4i32 addr:$src)))),
def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v4i32 addr:$src)))),
def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BQZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
}
// 512-bit patterns
@ -10873,7 +10873,7 @@ def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
def : Pat<(v2f64 (X86VBroadcast (v2f64 (nonvolatile_load addr:$src)))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))),
def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),


@ -99,10 +99,10 @@ def X86insertps : SDNode<"X86ISD::INSERTPS",
def X86vzmovl : SDNode<"X86ISD::VZEXT_MOVL",
SDTypeProfile<1, 1, [SDTCisSameAs<0,1>]>>;
def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86vextractstore : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86vzld : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86vextractst : SDNode<"X86ISD::VEXTRACT_STORE", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def SDTVtrunc : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>,
SDTCisInt<0>, SDTCisInt<1>,
@ -939,10 +939,20 @@ def bc_v8i64 : PatFrag<(ops node:$in), (v8i64 (bitconvert node:$in))>;
def bc_v8f64 : PatFrag<(ops node:$in), (v8f64 (bitconvert node:$in))>;
def bc_v16f32 : PatFrag<(ops node:$in), (v16f32 (bitconvert node:$in))>;
def vzload_v4i32 : PatFrag<(ops node:$src),
(bitconvert (v4i32 (X86vzload node:$src)))>;
def vzload_v2i64 : PatFrag<(ops node:$src),
(bitconvert (v2i64 (X86vzload node:$src)))>;
def X86vzload32 : PatFrag<(ops node:$src),
(X86vzld node:$src), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 4;
}]>;
def X86vzload64 : PatFrag<(ops node:$src),
(X86vzld node:$src), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
}]>;
def X86vextractstore64 : PatFrag<(ops node:$val, node:$ptr),
(X86vextractst node:$val, node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
}]>;
def fp32imm0 : PatLeaf<(f32 fpimm), [{


@ -226,14 +226,15 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
// Loading from memory automatically zeroing upper bits.
multiclass sse12_move_rm<RegisterClass RC, ValueType vt, X86MemOperand x86memop,
PatFrag mem_pat, string OpcodeStr, Domain d> {
PatFrag mem_pat, PatFrag vzloadfrag, string OpcodeStr,
Domain d> {
def V#NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (vt (X86vzload addr:$src)))], d>,
[(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
VEX, VEX_LIG, Sched<[WriteFLoad]>, VEX_WIG;
def NAME#rm : SI<0x10, MRMSrcMem, (outs VR128:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (vt (X86vzload addr:$src)))], d>,
[(set VR128:$dst, (vt (vzloadfrag addr:$src)))], d>,
Sched<[WriteFLoad]>;
// _alt version uses FR32/FR64 register class.
@ -255,9 +256,9 @@ defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd",
SSEPackedDouble, "MOVSD", UseSSE2>, XD;
let canFoldAsLoad = 1, isReMaterializable = 1 in {
defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, "movss",
defm MOVSS : sse12_move_rm<FR32, v4f32, f32mem, loadf32, X86vzload32, "movss",
SSEPackedSingle>, XS;
defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, "movsd",
defm MOVSD : sse12_move_rm<FR64, v2f64, f64mem, loadf64, X86vzload64, "movsd",
SSEPackedDouble>, XD;
}
@ -270,9 +271,9 @@ let Predicates = [UseAVX] in {
// Represent the same patterns above but in the form they appear for
// 256-bit types
def : Pat<(v8f32 (X86vzload addr:$src)),
def : Pat<(v8f32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSSrm addr:$src), sub_xmm)>;
def : Pat<(v4f64 (X86vzload addr:$src)),
def : Pat<(v4f64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDrm addr:$src), sub_xmm)>;
}
@ -663,6 +664,13 @@ let Predicates = [UseSSE1] in {
def : Pat<(X86Shufp (v4f32 (nonvolatile_load addr:$src2)), VR128:$src1,
(i8 -28)),
(MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Shufp (v4f32 (X86vzload64 addr:$src2)), VR128:$src1, (i8 -28)),
(MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(v4f32 (X86vzload64 addr:$src)),
(MOVLPSrm (v4f32 (V_SET0)), addr:$src)>;
def : Pat<(X86vextractstore64 (v4f32 VR128:$src), addr:$dst),
(MOVLPSmr addr:$dst, VR128:$src)>;
}
//===----------------------------------------------------------------------===//
@ -702,7 +710,7 @@ let Predicates = [UseAVX] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload addr:$src2))),
def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
@ -711,7 +719,7 @@ let Predicates = [UseAVX] in {
(VMOVHPDmr addr:$dst, VR128:$src)>;
// MOVLPD patterns
def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload addr:$src2))),
def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
(VMOVLPDrm VR128:$src1, addr:$src2)>;
}
@ -721,6 +729,12 @@ let Predicates = [UseSSE1] in {
// No need for aligned load, we're only loading 64-bits.
def : Pat<(X86Movlhps VR128:$src1, (v4f32 (nonvolatile_load addr:$src2))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Movlhps VR128:$src1, (v4f32 (X86vzload64 addr:$src2))),
(MOVHPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86vextractstore64 (v4f32 (X86Movhlps VR128:$src, VR128:$src)),
addr:$dst),
(MOVHPSmr addr:$dst, VR128:$src)>;
}
let Predicates = [UseSSE2] in {
@ -731,7 +745,7 @@ let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload addr:$src2))),
def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(store (f64 (extractelt
@ -740,7 +754,7 @@ let Predicates = [UseSSE2] in {
(MOVHPDmr addr:$dst, VR128:$src)>;
// MOVLPD patterns
def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload addr:$src2))),
def : Pat<(v2f64 (X86Movsd VR128:$src1, (X86vzload64 addr:$src2))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
}
@ -1631,13 +1645,13 @@ def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
// AVX register conversion intrinsics
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]
// SSE2 register conversion intrinsics
let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]
@ -4124,9 +4138,9 @@ let Predicates = [UseAVX] in {
// These instructions also write zeros in the high part of a 256-bit register.
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzload addr:$src)),
def : Pat<(v4i32 (X86vzload32 addr:$src)),
(VMOVDI2PDIrm addr:$src)>;
def : Pat<(v8i32 (X86vzload addr:$src)),
def : Pat<(v8i32 (X86vzload32 addr:$src)),
(SUBREG_TO_REG (i64 0), (v4i32 (VMOVDI2PDIrm addr:$src)), sub_xmm)>;
}
@ -4138,7 +4152,7 @@ let Predicates = [UseSSE2] in {
(MOV64toPQIrr GR64:$src)>;
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
(MOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzload addr:$src)),
def : Pat<(v4i32 (X86vzload32 addr:$src)),
(MOVDI2PDIrm addr:$src)>;
}
@ -4206,19 +4220,19 @@ def : InstAlias<"movq.s\t{$src, $dst|$dst, $src}",
(MOVPQI2QIrr VR128:$dst, VR128:$src), 0>;
let Predicates = [UseAVX] in {
def : Pat<(v2i64 (X86vzload addr:$src)),
def : Pat<(v2i64 (X86vzload64 addr:$src)),
(VMOVQI2PQIrm addr:$src)>;
def : Pat<(v4i64 (X86vzload addr:$src)),
def : Pat<(v4i64 (X86vzload64 addr:$src)),
(SUBREG_TO_REG (i64 0), (v2i64 (VMOVQI2PQIrm addr:$src)), sub_xmm)>;
def : Pat<(X86vextractstore (v2i64 VR128:$src), addr:$dst),
def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
(VMOVPQI2QImr addr:$dst, VR128:$src)>;
}
let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86vzload addr:$src)), (MOVQI2PQIrm addr:$src)>;
def : Pat<(v2i64 (X86vzload64 addr:$src)), (MOVQI2PQIrm addr:$src)>;
def : Pat<(X86vextractstore (v2i64 VR128:$src), addr:$dst),
def : Pat<(X86vextractstore64 (v2i64 VR128:$src), addr:$dst),
(MOVPQI2QImr addr:$dst, VR128:$src)>;
}
@ -4368,7 +4382,7 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}
@ -4376,7 +4390,7 @@ let Predicates = [UseSSE3] in {
// No need for aligned memory as this only loads 64-bits.
def : Pat<(X86Movddup (v2f64 (nonvolatile_load addr:$src))),
(MOVDDUPrm addr:$src)>;
def : Pat<(X86Movddup (v2f64 (X86vzload addr:$src))),
def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(MOVDDUPrm addr:$src)>;
}
@ -4953,7 +4967,7 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
def : Pat<(v8i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
@ -4961,12 +4975,12 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
}
}
@ -5018,7 +5032,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
def : Pat<(v8i16 (ExtOp (loadv16i8 addr:$src))),
(!cast<I>(OpcPrefix#BWrm) addr:$src)>;
@ -5026,7 +5040,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v4i32 addr:$src)))),
def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (loadv16i8 addr:$src))),
(!cast<I>(OpcPrefix#BDrm) addr:$src)>;
@ -5040,14 +5054,14 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
def : Pat<(v4i32 (ExtOp (loadv8i16 addr:$src))),
(!cast<I>(OpcPrefix#WDrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v4i32 addr:$src)))),
def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (loadv8i16 addr:$src))),
(!cast<I>(OpcPrefix#WQrm) addr:$src)>;
@ -5056,7 +5070,7 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
def : Pat<(v2i64 (ExtOp (loadv4i32 addr:$src))),
(!cast<I>(OpcPrefix#DQrm) addr:$src)>;
@ -7261,10 +7275,10 @@ let Predicates = [HasF16C, NoVLX] in {
WriteCvtPS2PHYSt>, VEX_L;
// Pattern match vcvtph2ps of a scalar i64 load.
def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(VCVTPH2PSrm addr:$src)>;
def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
(v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16
(v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(VCVTPH2PSrm addr:$src)>;
def : Pat<(store (f64 (extractelt
@ -7436,9 +7450,9 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
let Predicates = [HasAVX2, NoVLX] in {
// 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
(VPBROADCASTQYrm addr:$src)>;
def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
@ -7550,7 +7564,7 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVDDUPrr VR128:$src)>;
def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
(VMOVDDUPrm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload addr:$src)))),
def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
(VMOVDDUPrm addr:$src)>;
}


@ -1319,14 +1319,8 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
; X86-SSE-LABEL: test_mm_loadh_pi:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT: movss (%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x08]
; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movss 4(%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x50,0x04]
; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
; X86-SSE-NEXT: shufps $0, %xmm1, %xmm2 # encoding: [0x0f,0xc6,0xd1,0x00]
; X86-SSE-NEXT: # xmm2 = xmm2[0,0],xmm1[0,0]
; X86-SSE-NEXT: shufps $36, %xmm2, %xmm0 # encoding: [0x0f,0xc6,0xc2,0x24]
; X86-SSE-NEXT: # xmm0 = xmm0[0,1],xmm2[2,0]
; X86-SSE-NEXT: movhps (%eax), %xmm0 # encoding: [0x0f,0x16,0x00]
; X86-SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,1]
; X86-SSE-NEXT: retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_loadh_pi:
@ -1345,18 +1339,8 @@ define <4 x float> @test_mm_loadh_pi(<4 x float> %a0, x86_mmx* %a1) {
;
; X64-SSE-LABEL: test_mm_loadh_pi:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xf8]
; X64-SSE-NEXT: shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xfc]
; X64-SSE-NEXT: movss -{{[0-9]+}}(%rsp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0xf8]
; X64-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movss -{{[0-9]+}}(%rsp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0xfc]
; X64-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
; X64-SSE-NEXT: unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
; X64-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0]
; X64-SSE-NEXT: movhps (%rdi), %xmm0 # encoding: [0x0f,0x16,0x07]
; X64-SSE-NEXT: # xmm0 = xmm0[0,1],mem[0,1]
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_loadh_pi:
@ -1381,15 +1365,8 @@ define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
; X86-SSE-LABEL: test_mm_loadl_pi:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT: movss (%eax), %xmm2 # encoding: [0xf3,0x0f,0x10,0x10]
; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
; X86-SSE-NEXT: movss 4(%eax), %xmm1 # encoding: [0xf3,0x0f,0x10,0x48,0x04]
; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT: shufps $0, %xmm2, %xmm1 # encoding: [0x0f,0xc6,0xca,0x00]
; X86-SSE-NEXT: # xmm1 = xmm1[0,0],xmm2[0,0]
; X86-SSE-NEXT: shufps $226, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe2]
; X86-SSE-NEXT: # xmm1 = xmm1[2,0],xmm0[2,3]
; X86-SSE-NEXT: movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; X86-SSE-NEXT: movlps (%eax), %xmm0 # encoding: [0x0f,0x12,0x00]
; X86-SSE-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; X86-SSE-NEXT: retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_loadl_pi:
@ -1408,19 +1385,8 @@ define <4 x float> @test_mm_loadl_pi(<4 x float> %a0, x86_mmx* %a1) {
;
; X64-SSE-LABEL: test_mm_loadl_pi:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07]
; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xf8]
; X64-SSE-NEXT: shrq $32, %rax # encoding: [0x48,0xc1,0xe8,0x20]
; X64-SSE-NEXT: movl %eax, -{{[0-9]+}}(%rsp) # encoding: [0x89,0x44,0x24,0xfc]
; X64-SSE-NEXT: movss -{{[0-9]+}}(%rsp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0xf8]
; X64-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
; X64-SSE-NEXT: movss -{{[0-9]+}}(%rsp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0xfc]
; X64-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
; X64-SSE-NEXT: unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
; X64-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; X64-SSE-NEXT: shufps $228, %xmm0, %xmm1 # encoding: [0x0f,0xc6,0xc8,0xe4]
; X64-SSE-NEXT: # xmm1 = xmm1[0,1],xmm0[2,3]
; X64-SSE-NEXT: movaps %xmm1, %xmm0 # encoding: [0x0f,0x28,0xc1]
; X64-SSE-NEXT: movlps (%rdi), %xmm0 # encoding: [0x0f,0x12,0x07]
; X64-SSE-NEXT: # xmm0 = mem[0,1],xmm0[2,3]
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_loadl_pi:
@ -2818,13 +2784,7 @@ define void @test_mm_storeh_pi2(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X86-SSE-LABEL: test_mm_storeh_pi2:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT: movaps %xmm0, %xmm1 # encoding: [0x0f,0x28,0xc8]
; X86-SSE-NEXT: movhlps %xmm0, %xmm1 # encoding: [0x0f,0x12,0xc8]
; X86-SSE-NEXT: # xmm1 = xmm0[1],xmm1[1]
; X86-SSE-NEXT: shufps $231, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0xe7]
; X86-SSE-NEXT: # xmm0 = xmm0[3,1,2,3]
; X86-SSE-NEXT: movss %xmm0, 4(%eax) # encoding: [0xf3,0x0f,0x11,0x40,0x04]
; X86-SSE-NEXT: movss %xmm1, (%eax) # encoding: [0xf3,0x0f,0x11,0x08]
; X86-SSE-NEXT: movhps %xmm0, (%eax) # encoding: [0x0f,0x17,0x00]
; X86-SSE-NEXT: retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storeh_pi2:
@ -2841,11 +2801,7 @@ define void @test_mm_storeh_pi2(x86_mmx *%a0, <4 x float> %a1) nounwind {
;
; X64-SSE-LABEL: test_mm_storeh_pi2:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movhlps %xmm0, %xmm0 # encoding: [0x0f,0x12,0xc0]
; X64-SSE-NEXT: # xmm0 = xmm0[1,1]
; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xe8]
; X64-SSE-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-SSE-NEXT: movhps %xmm0, (%rdi) # encoding: [0x0f,0x17,0x07]
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storeh_pi2:
@ -2922,10 +2878,7 @@ define void @test_mm_storel_pi2(x86_mmx *%a0, <4 x float> %a1) nounwind {
; X86-SSE-LABEL: test_mm_storel_pi2:
; X86-SSE: # %bb.0:
; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
; X86-SSE-NEXT: movss %xmm0, (%eax) # encoding: [0xf3,0x0f,0x11,0x00]
; X86-SSE-NEXT: shufps $229, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0xe5]
; X86-SSE-NEXT: # xmm0 = xmm0[1,1,2,3]
; X86-SSE-NEXT: movss %xmm0, 4(%eax) # encoding: [0xf3,0x0f,0x11,0x40,0x04]
; X86-SSE-NEXT: movlps %xmm0, (%eax) # encoding: [0x0f,0x13,0x00]
; X86-SSE-NEXT: retl # encoding: [0xc3]
;
; X86-AVX1-LABEL: test_mm_storel_pi2:
@ -2942,9 +2895,7 @@ define void @test_mm_storel_pi2(x86_mmx *%a0, <4 x float> %a1) nounwind {
;
; X64-SSE-LABEL: test_mm_storel_pi2:
; X64-SSE: # %bb.0:
; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # encoding: [0x0f,0x29,0x44,0x24,0xe8]
; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rax # encoding: [0x48,0x8b,0x44,0x24,0xe8]
; X64-SSE-NEXT: movq %rax, (%rdi) # encoding: [0x48,0x89,0x07]
; X64-SSE-NEXT: movlps %xmm0, (%rdi) # encoding: [0x0f,0x13,0x07]
; X64-SSE-NEXT: retq # encoding: [0xc3]
;
; X64-AVX1-LABEL: test_mm_storel_pi2:


@ -230,15 +230,7 @@ define <4 x float> @insert_mem_and_zero_v4f32(float* %ptr) {
define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE1-LABEL: insert_mem_lo_v4f32:
; SSE1: # %bb.0:
; SSE1-NEXT: movq (%rdi), %rax
; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; SSE1-NEXT: shrq $32, %rax
; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE1-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3]
; SSE1-NEXT: movaps %xmm1, %xmm0
; SSE1-NEXT: movlps {{.*#+}} xmm0 = mem[0,1],xmm0[2,3]
; SSE1-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr
%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@ -249,14 +241,7 @@ define <4 x float> @insert_mem_lo_v4f32(<2 x float>* %ptr, <4 x float> %b) {
define <4 x float> @insert_mem_hi_v4f32(<2 x float>* %ptr, <4 x float> %b) {
; SSE1-LABEL: insert_mem_hi_v4f32:
; SSE1: # %bb.0:
; SSE1-NEXT: movq (%rdi), %rax
; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; SSE1-NEXT: shrq $32, %rax
; SSE1-NEXT: movl %eax, -{{[0-9]+}}(%rsp)
; SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE1-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE1-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE1-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE1-NEXT: retq
%a = load <2 x float>, <2 x float>* %ptr
%v = shufflevector <2 x float> %a, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>