
[X86] Remove isel patterns for (X86VBroadcast (i16 (trunc (i32 (load))))). Replace with a DAG combine to form VBROADCAST_LOAD.

isTypeDesirableForOp prevents loads from being shrunk to i16 by DAG
combine, so we can't just match the broadcast and a scalar load.
Instead, look for broadcast+truncate+load and form a vbroadcast_load
during DAG combine. This replaces what was previously done as an isel
pattern, and I think it also fixes things so we won't change the size
of a volatile load. But my main motivation is just to clean up our
isel patterns.
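
For context, here is a minimal sketch of the kind of source that hits this path (a hypothetical example, not taken from the commit; the function name is made up): broadcasting the low 16 bits of a 32-bit load under AVX2 reaches the DAG as a broadcast of (i16 (trunc (i32 (load)))), because DAG combine declines to shrink the load to i16.

#include <immintrin.h>

// Hypothetical illustration: with AVX2 enabled, the splat below should be
// selected as vpbroadcastw. Previously the removed isel patterns folded the
// load; with this change, the new DAG combine forms a VBROADCAST_LOAD instead.
__m256i broadcast_low16(const int *p) {
  return _mm256_set1_epi16((short)*p);
}

The LN->isSimple() check in the new combine means volatile or atomic loads keep their original width, which the old isel patterns couldn't guarantee.
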
Craig Topper 2020-03-10 00:00:26 -07:00
parent aa2a6dc76f
commit bf3b71f64b
3 changed files with 22 additions and 10 deletions

lib/Target/X86/X86ISelLowering.cpp

@@ -35171,6 +35171,28 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
      return N; // Return N so it doesn't get rechecked!
    }

    // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
    // i16. So shrink it ourselves if we can make a broadcast_load.
    if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
        Src.hasOneUse() && ISD::isNormalLoad(Src.getOperand(0).getNode()) &&
        Src.getOperand(0).hasOneUse()) {
      assert(Subtarget.hasAVX2() && "Expected AVX2");
      LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
      if (LN->isSimple()) {
        SDVTList Tys = DAG.getVTList(VT, MVT::Other);
        SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
        SDValue BcastLd =
            DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
                                    MVT::i16, LN->getPointerInfo(),
                                    LN->getAlignment(),
                                    LN->getMemOperand()->getFlags());
        DCI.CombineTo(N.getNode(), BcastLd);
        DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
        DCI.recursivelyDeleteUnusedNodes(LN);
        return N; // Return N so it doesn't get rechecked!
      }
    }

    // vbroadcast(vzload X) -> vbroadcast_load X
    if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
      MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);

lib/Target/X86/X86InstrAVX512.td

@@ -1426,10 +1426,6 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
let Predicates = [HasVLX, HasBWI] in {
  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
  // This means we'll encounter truncated i32 loads; match that here.
  def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWZ128rm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWZ256rm addr:$src)>;
  def : Pat<(v8i16 (X86VBroadcast
                    (i16 (trunc (i32 (extloadi16 addr:$src)))))),
            (VPBROADCASTWZ128rm addr:$src)>;
@@ -1446,8 +1442,6 @@ let Predicates = [HasVLX, HasBWI] in {
let Predicates = [HasBWI] in {
  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
  // This means we'll encounter truncated i32 loads; match that here.
  def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWZrm addr:$src)>;
  def : Pat<(v32i16 (X86VBroadcast
                     (i16 (trunc (i32 (extloadi16 addr:$src)))))),
            (VPBROADCASTWZrm addr:$src)>;

lib/Target/X86/X86InstrSSE.td

@@ -7517,10 +7517,6 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastl
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
  // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
  // This means we'll encounter truncated i32 loads; match that here.
  def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWrm addr:$src)>;
  def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
            (VPBROADCASTWYrm addr:$src)>;
  def : Pat<(v8i16 (X86VBroadcast
                    (i16 (trunc (i32 (extloadi16 addr:$src)))))),
            (VPBROADCASTWrm addr:$src)>;