From 55480673fc1dcc8ede74fd427eb51eea02a4e707 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Sun, 4 Mar 2018 01:48:00 +0000
Subject: [PATCH] [X86] Lower v1i1/v2i1/v4i1/v8i1 loads/stores to i8
 loads/stores during op legalization if AVX512DQ is not supported.

We were previously doing this with isel patterns. Moving it to op
legalization gives us a chance to see the required bitcast earlier, and it
lets us remove some isel patterns.

llvm-svn: 326669
---
 lib/Target/X86/X86ISelLowering.cpp | 69 +++++++++++++++++++++++++++---
 lib/Target/X86/X86InstrAVX512.td   | 29 +------------
 test/CodeGen/X86/avx512-mask-op.ll |  8 +---
 3 files changed, 68 insertions(+), 38 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 18f32768a06..e8b3f3656b6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -1177,6 +1177,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
     setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
 
+    // There is no byte sized k-register load or store without AVX512DQ.
+    if (!Subtarget.hasDQI()) {
+      setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
+      setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
+      setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
+      setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
+
+      setOperationAction(ISD::STORE, MVT::v1i1, Custom);
+      setOperationAction(ISD::STORE, MVT::v2i1, Custom);
+      setOperationAction(ISD::STORE, MVT::v4i1, Custom);
+      setOperationAction(ISD::STORE, MVT::v8i1, Custom);
+    }
+
     // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
     for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
       setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
@@ -18983,6 +18996,30 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
 
+static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
+                          SelectionDAG &DAG) {
+  StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
+  EVT VT = St->getValue().getValueType();
+  SDLoc dl(St);
+  SDValue StoredVal = St->getOperand(1);
+
+  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores.
+  assert(VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+         VT.getVectorNumElements() <= 8 && "Unexpected VT");
+  assert(!St->isTruncatingStore() && "Expected non-truncating store");
+  assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+         "Expected AVX512F without AVX512DQI");
+
+  StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+                          DAG.getUNDEF(MVT::v8i1), StoredVal,
+                          DAG.getIntPtrConstant(0, dl));
+  StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
+
+  return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+                      St->getPointerInfo(), St->getAlignment(),
+                      St->getMemOperand()->getFlags());
+}
+
 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
 // may emit an illegal shuffle but the expansion is still better than scalar
 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
@@ -18990,20 +19027,41 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
 // TODO: It is possible to support ZExt by zeroing the undef values during
 // the shuffle phase or after the shuffle.
-static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
   MVT RegVT = Op.getSimpleValueType();
   assert(RegVT.isVector() && "We only custom lower vector sext loads.");
   assert(RegVT.isInteger() &&
          "We only custom lower integer vector sext loads.");
 
-  // Nothing useful we can do without SSE2 shuffles.
-  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
-
   LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
   SDLoc dl(Ld);
   EVT MemVT = Ld->getMemoryVT();
 
+  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
+  if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
+    assert(EVT(RegVT) == MemVT && "Expected non-extending load");
+    assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
+    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+           "Expected AVX512F without AVX512DQI");
+
+    SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
+                                Ld->getPointerInfo(), Ld->getAlignment(),
+                                Ld->getMemOperand()->getFlags());
+
+    // Replace chain users with the new chain.
+    assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLd.getValue(1));
+
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
+                                  DAG.getBitcast(MVT::v8i1, NewLd),
+                                  DAG.getIntPtrConstant(0, dl));
+    return DAG.getMergeValues({Extract, NewLd.getValue(1)}, dl);
+  }
+
+  // Nothing useful we can do without SSE2 shuffles.
+  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
+
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   unsigned RegSz = RegVT.getSizeInBits();
 
@@ -24766,7 +24824,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT:     return LowerFP_TO_INT(Op, DAG);
   case ISD::FP_EXTEND:      return LowerFP_EXTEND(Op, DAG);
-  case ISD::LOAD:           return LowerExtendedLoad(Op, Subtarget, DAG);
+  case ISD::LOAD:           return LowerLoad(Op, Subtarget, DAG);
+  case ISD::STORE:          return LowerStore(Op, Subtarget, DAG);
   case ISD::FABS:
   case ISD::FNEG:           return LowerFABSorFNEG(Op, DAG);
   case ISD::FCOPYSIGN:      return LowerFCOPYSIGN(Op, DAG);
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 91e4aca1489..59ed5fd6f31 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -2816,40 +2816,15 @@ let Predicates = [HasDQI] in {
   def : Pat<(store VK1:$src, addr:$dst),
             (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;
 
+  def : Pat<(v1i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK1)>;
   def : Pat<(v2i1 (load addr:$src)),
             (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK2)>;
   def : Pat<(v4i1 (load addr:$src)),
             (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK4)>;
 }
 
-let Predicates = [HasAVX512, NoDQI] in {
-  def : Pat<(store VK1:$src, addr:$dst),
-            (MOV8mr addr:$dst,
-             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)),
-                  sub_8bit)))>;
-  def : Pat<(store VK2:$src, addr:$dst),
-            (MOV8mr addr:$dst,
-             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK2:$src, GR32)),
-                  sub_8bit)))>;
-  def : Pat<(store VK4:$src, addr:$dst),
-            (MOV8mr addr:$dst,
-             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK4:$src, GR32)),
-                  sub_8bit)))>;
-  def : Pat<(store VK8:$src, addr:$dst),
-            (MOV8mr addr:$dst,
-             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)),
-                  sub_8bit)))>;
-
-  def : Pat<(v8i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
-  def : Pat<(v2i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK2)>;
-  def : Pat<(v4i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK4)>;
-}
 let Predicates = [HasAVX512] in {
-  def : Pat<(v1i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK1)>;
   def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
             (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
 }
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index 88d5ed4f7ad..1038c90ab9d 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -348,9 +348,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
 define i8 @conv1(<8 x i1>* %R) {
 ; KNL-LABEL: conv1:
 ; KNL:       ## %bb.0: ## %entry
-; KNL-NEXT:    kxnorw %k0, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    movb %al, (%rdi)
+; KNL-NEXT:    movb $-1, (%rdi)
 ; KNL-NEXT:    movb $-2, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT:    movb $-2, %al
 ; KNL-NEXT:    retq
@@ -365,9 +363,7 @@ define i8 @conv1(<8 x i1>* %R) {
 ;
 ; AVX512BW-LABEL: conv1:
 ; AVX512BW:       ## %bb.0: ## %entry
-; AVX512BW-NEXT:    kxnorw %k0, %k0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    movb $-1, (%rdi)
 ; AVX512BW-NEXT:    movb $-2, -{{[0-9]+}}(%rsp)
 ; AVX512BW-NEXT:    movb $-2, %al
 ; AVX512BW-NEXT:    retq
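
For illustration, a minimal sketch of the kind of IR this change affects (a
hypothetical function, not a test from this patch): compiled for an AVX512F
target without AVX512DQ (e.g. llc -mtriple=x86_64-- -mattr=+avx512f,-avx512dq),
the v4i1 store below now reaches the new LowerStore path instead of the removed
NoDQI isel patterns; the mask is widened to v8i1 with INSERT_SUBVECTOR, bitcast
to i8, and emitted as a byte store. The loaded case goes through LowerLoad the
same way in reverse (i8 load, bitcast to v8i1, EXTRACT_SUBVECTOR).

  ; Hypothetical example, not part of this patch.
  define void @store_v4i1(<4 x i1> %mask, <4 x i1>* %p) {
    store <4 x i1> %mask, <4 x i1>* %p
    ret void
  }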