1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00

[X86] Don't widen 128/256-bit strict compares with vXi1 result to 512-bits on KNL.

If we widen the compare we might trigger a spurious exception from
the garbage data.

We have two choices here: explicitly force the upper bits to zero, or
use a legacy VEX vcmpps/pd instruction and convert the XMM/YMM
result to a mask register.

I've chosen to go with the second option, though I'm not sure which is
really best. In some cases we could get rid of the zeroing, since
the producing instruction probably already zeroed the upper bits. But we
lose the ability to fold a load. So which is best depends on the
surrounding code.

Differential Revision: https://reviews.llvm.org/D74522
This commit is contained in:
Craig Topper 2020-02-13 11:10:57 -08:00
parent 2357c0bf62
commit ab2e139a07
4 changed files with 474 additions and 651 deletions

View File

@ -21645,8 +21645,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
// If we have a strict compare with a vXi1 result and the input is 128/256
// bits we can't use a masked compare unless we have VLX. If we use a wider
// compare like we do for non-strict, we might trigger spurious exceptions
// from the upper elements. Instead emit a AVX compare and convert to mask.
unsigned Opc;
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
(!IsStrict || Subtarget.hasVLX() ||
Op0.getSimpleValueType().is512BitVector())) {
assert(VT.getVectorNumElements() <= 16);
Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
} else {
@ -21742,10 +21748,19 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
// result type of SETCC. The bitcast is expected to be optimized away
// during combining/isel.
Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
if (VT.getSizeInBits() > Op.getSimpleValueType().getSizeInBits()) {
// We emitted a compare with an XMM/YMM result. Finish converting to a
// mask register using a vptestm.
EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
Cmp = DAG.getBitcast(CastVT, Cmp);
Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
DAG.getConstant(0, dl, CastVT), ISD::SETNE);
} else {
// If this is SSE/AVX CMPP, bitcast the result back to integer to match
// the result type of SETCC. The bitcast is expected to be optimized
// away during combining/isel.
Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
}
if (IsStrict)
return DAG.getMergeValues({Cmp, Chain}, dl);

View File

@ -3232,8 +3232,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
multiclass axv512_cmp_packed_cc_no_vlx_lowering<string InstStr,
X86VectorVTInfo Narrow,
X86VectorVTInfo Wide> {
def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT Narrow.RC:$src1),
(Narrow.VT Narrow.RC:$src2), timm:$cc)),
def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
(Narrow.VT Narrow.RC:$src2), timm:$cc)),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrri")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@ -3250,8 +3250,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
timm:$cc), Narrow.KRC)>;
// Broadcast load.
def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT Narrow.RC:$src1),
(Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)),
def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
(Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrmbi")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@ -3266,8 +3266,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
addr:$src2, timm:$cc), Narrow.KRC)>;
// Commuted with broadcast load.
def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
(Narrow.VT Narrow.RC:$src1), timm:$cc)),
def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
(Narrow.VT Narrow.RC:$src1), timm:$cc)),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrmbi")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff