[X86] Lower v1i1/v2i1/v4i1/v8i1 load/stores to i8 load/store during op legalization if AVX512DQ is not supported.

We were previously doing this with isel patterns. Moving it to op legalization gives us chance to see the required bitcast earlier. And it lets us remove some isel patterns. llvm-svn: 326669
2024-11-26 04:32:44 +01:00 · 2018-03-04 01:48:00 +00:00 · 2018-03-04 01:48:00 +00:00 · 55480673fc
commit 55480673fc
parent 2de161d33b
3 changed files with 68 additions and 38 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -1177,6 +1177,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
    setOperationAction(ISD::FP_TO_SINT,         MVT::v2i1,  Custom);
    setOperationAction(ISD::FP_TO_UINT,         MVT::v2i1,  Custom);

+    // There is no byte sized k-register load or store without AVX512DQ.
+    if (!Subtarget.hasDQI()) {
+      setOperationAction(ISD::LOAD, MVT::v1i1, Custom);
+      setOperationAction(ISD::LOAD, MVT::v2i1, Custom);
+      setOperationAction(ISD::LOAD, MVT::v4i1, Custom);
+      setOperationAction(ISD::LOAD, MVT::v8i1, Custom);
+
+      setOperationAction(ISD::STORE, MVT::v1i1, Custom);
+      setOperationAction(ISD::STORE, MVT::v2i1, Custom);
+      setOperationAction(ISD::STORE, MVT::v4i1, Custom);
+      setOperationAction(ISD::STORE, MVT::v8i1, Custom);
+    }
+
    // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
@ -18983,6 +18996,30 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }

+static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
+                          SelectionDAG &DAG) {
+  StoreSDNode *St = cast<StoreSDNode>(Op.getNode());
+  EVT VT = St->getValue().getValueType();
+  SDLoc dl(St);
+  SDValue StoredVal = St->getOperand(1);
+
+  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
+  assert(VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+         VT.getVectorNumElements() <= 8 && "Unexpected VT");
+  assert(!St->isTruncatingStore() && "Expected non-truncating store");
+  assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+         "Expected AVX512F without AVX512DQI");
+
+  StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
+                          DAG.getUNDEF(MVT::v8i1), StoredVal,
+                          DAG.getIntPtrConstant(0, dl));
+  StoredVal = DAG.getBitcast(MVT::i8, StoredVal);
+
+  return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
+                      St->getPointerInfo(), St->getAlignment(),
+                      St->getMemOperand()->getFlags());
+}
+
 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
 // may emit an illegal shuffle but the expansion is still better than scalar
 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
@ -18990,20 +19027,41 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
 // TODO: It is possible to support ZExt by zeroing the undef values during
 // the shuffle phase or after the shuffle.
-static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
+static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  MVT RegVT = Op.getSimpleValueType();
  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
  assert(RegVT.isInteger() &&
         "We only custom lower integer vector sext loads.");

-  // Nothing useful we can do without SSE2 shuffles.
-  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
-
  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
  SDLoc dl(Ld);
  EVT MemVT = Ld->getMemoryVT();

+  // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads.
+  if (RegVT.isVector() && RegVT.getVectorElementType() == MVT::i1) {
+    assert(EVT(RegVT) == MemVT && "Expected non-extending load");
+    assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT");
+    assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() &&
+           "Expected AVX512F without AVX512DQI");
+
+    SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
+                                Ld->getPointerInfo(), Ld->getAlignment(),
+                                Ld->getMemOperand()->getFlags());
+
+    // Replace chain users with the new chain.
+    assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!");
+    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewLd.getValue(1));
+
+    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT,
+                                  DAG.getBitcast(MVT::v8i1, NewLd),
+                                  DAG.getIntPtrConstant(0, dl));
+    return DAG.getMergeValues({Extract, NewLd.getValue(1)}, dl);
+  }
+
+  // Nothing useful we can do without SSE2 shuffles.
+  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
+
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned RegSz = RegVT.getSizeInBits();

@ -24766,7 +24824,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
-  case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
+  case ISD::LOAD:               return LowerLoad(Op, Subtarget, DAG);
+  case ISD::STORE:              return LowerStore(Op, Subtarget, DAG);
  case ISD::FABS:
  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@ -2816,40 +2816,15 @@ let Predicates = [HasDQI] in {
  def : Pat<(store VK1:$src, addr:$dst),
            (KMOVBmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK8))>;

+  def : Pat<(v1i1 (load addr:$src)),
+            (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK1)>;
  def : Pat<(v2i1 (load addr:$src)),
            (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK2)>;
  def : Pat<(v4i1 (load addr:$src)),
            (COPY_TO_REGCLASS (KMOVBkm addr:$src), VK4)>;
 }
-let Predicates = [HasAVX512, NoDQI] in {
-  def : Pat<(store VK1:$src, addr:$dst),
-            (MOV8mr addr:$dst,
-             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)),
-              sub_8bit)))>;
-  def : Pat<(store VK2:$src, addr:$dst),
-            (MOV8mr addr:$dst,
-             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK2:$src, GR32)),
-              sub_8bit)))>;
-  def : Pat<(store VK4:$src, addr:$dst),
-            (MOV8mr addr:$dst,
-             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK4:$src, GR32)),
-              sub_8bit)))>;
-  def : Pat<(store VK8:$src, addr:$dst),
-            (MOV8mr addr:$dst,
-             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)),
-              sub_8bit)))>;
-
-  def : Pat<(v8i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
-  def : Pat<(v2i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK2)>;
-  def : Pat<(v4i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK4)>;
-}

 let Predicates = [HasAVX512] in {
-  def : Pat<(v1i1 (load addr:$src)),
-            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK1)>;
  def : Pat<(v8i1 (bitconvert (i8 (load addr:$src)))),
            (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
 }
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@ -348,9 +348,7 @@ define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
 define i8 @conv1(<8 x i1>* %R) {
 ; KNL-LABEL: conv1:
 ; KNL:       ## %bb.0: ## %entry
-; KNL-NEXT:    kxnorw %k0, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    movb %al, (%rdi)
+; KNL-NEXT:    movb $-1, (%rdi)
 ; KNL-NEXT:    movb $-2, -{{[0-9]+}}(%rsp)
 ; KNL-NEXT:    movb $-2, %al
 ; KNL-NEXT:    retq
@ -365,9 +363,7 @@ define i8 @conv1(<8 x i1>* %R) {
 ;
 ; AVX512BW-LABEL: conv1:
 ; AVX512BW:       ## %bb.0: ## %entry
-; AVX512BW-NEXT:    kxnorw %k0, %k0, %k0
-; AVX512BW-NEXT:    kmovd %k0, %eax
-; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    movb $-1, (%rdi)
 ; AVX512BW-NEXT:    movb $-2, -{{[0-9]+}}(%rsp)
 ; AVX512BW-NEXT:    movb $-2, %al
 ; AVX512BW-NEXT:    retq