[X86] Attempt to pre-truncate arithmetic operations if useful

In some cases its more efficient to combine TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) if the binop is legal for the truncated types. This is true for vector integer multiplication (especially vXi64), as well as ADD/AND/XOR/OR in cases where we only need to truncate one of the inputs at runtime (e.g. a duplicated input or an one use constant we can fold). Further work could be done here - scalar cases (especially i64) could often benefit (if we avoid partial registers etc.), other opcodes, and better analysis of when truncating the inputs reduces costs. I have considered implementing this for all targets within the DAGCombiner but wasn't sure we could devise a suitable cost model system that would give us the range we need. Differential Revision: https://reviews.llvm.org/D28219 llvm-svn: 290947
2024-11-22 18:54:02 +01:00 · 2017-01-04 08:05:42 +00:00 · 2017-01-04 08:05:42 +00:00 · 31a63f3400
commit 31a63f3400
parent 92d154bbfd
4 changed files with 506 additions and 1006 deletions
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@ -31833,6 +31833,83 @@ static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
  return SDValue();
 }

+/// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
+/// the codegen.
+/// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
+static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
+                                          const X86Subtarget &Subtarget,
+                                          SDLoc &DL) {
+  assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
+  SDValue Src = N->getOperand(0);
+  unsigned Opcode = Src.getOpcode();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  EVT VT = N->getValueType(0);
+  EVT SrcVT = Src.getValueType();
+
+  auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
+    // TODO: Add extra cases where we can truncate both inputs for the
+    // cost of one (or none).
+    // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
+    if (Op0 == Op1)
+      return true;
+
+    SDValue BC0 = peekThroughOneUseBitcasts(Op0);
+    SDValue BC1 = peekThroughOneUseBitcasts(Op1);
+    return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
+           ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
+  };
+
+  auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
+    SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
+    SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+    return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
+  };
+
+  // Don't combine if the operation has other uses.
+  if (!N->isOnlyUserOf(Src.getNode()))
+    return SDValue();
+
+  // Only support vector truncation for now.
+  // TODO: i64 scalar math would benefit as well.
+  if (!VT.isVector())
+    return SDValue();
+
+  // In most cases its only worth pre-truncating if we're only facing the cost
+  // of one truncation.
+  // i.e. if one of the inputs will constant fold or the input is repeated.
+  switch (Opcode) {
+  case ISD::AND:
+  case ISD::XOR:
+  case ISD::OR: {
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
+        IsRepeatedOpOrOneUseConstant(Op0, Op1))
+      return TruncateArithmetic(Op0, Op1);
+    break;
+  }
+
+  case ISD::MUL:
+    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
+    // better to truncate if we have the chance.
+    if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
+        !TLI.isOperationLegal(Opcode, SrcVT))
+      return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
+    LLVM_FALLTHROUGH;
+  case ISD::ADD: {
+    SDValue Op0 = Src.getOperand(0);
+    SDValue Op1 = Src.getOperand(1);
+    if (TLI.isOperationLegal(Opcode, VT) &&
+        IsRepeatedOpOrOneUseConstant(Op0, Op1))
+      return TruncateArithmetic(Op0, Op1);
+    break;
+  }
+  }
+
+  return SDValue();
+}
+
 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
 static SDValue
 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
@ -32019,6 +32096,10 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
  SDValue Src = N->getOperand(0);
  SDLoc DL(N);

+  // Attempt to pre-truncate inputs to arithmetic ops instead.
+  if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
+    return V;
+
  // Try to detect AVG pattern first.
  if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
    return Avg;
--- a/test/CodeGen/X86/avx512-any_extend_load.ll
+++ b/test/CodeGen/X86/avx512-any_extend_load.ll
@ -22,10 +22,8 @@ define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
 define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
 ; KNL-LABEL: any_extend_load_v8i32:
 ; KNL:       # BB#0:
-; KNL-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; KNL-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm1
-; KNL-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; KNL-NEXT:    vpmovdw %zmm0, %ymm0
+; KNL-NEXT:    vpmovzxbw {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; KNL-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
 ; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; KNL-NEXT:    vmovq %xmm0, (%rdi)
 ; KNL-NEXT:    retq
--- a/test/CodeGen/X86/i64-to-float.ll
+++ b/test/CodeGen/X86/i64-to-float.ll
@ -71,34 +71,32 @@ define <2 x double> @mask_uitofp_2i64_2f64(<2 x i64> %a) nounwind {
 define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
 ; X32-SSE-LABEL: mask_sitofp_4i64_4f32:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    andps {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT:    andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-SSE-NEXT:    andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X32-SSE-NEXT:    retl
 ;
 ; X32-AVX-LABEL: mask_sitofp_4i64_4f32:
 ; X32-AVX:       # BB#0:
-; X32-AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X32-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-AVX-NEXT:    vandps {{\.LCPI.*}}, %xmm0, %xmm0
 ; X32-AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X32-AVX-NEXT:    vzeroupper
 ; X32-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mask_sitofp_4i64_4f32:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    andps {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: mask_sitofp_4i64_4f32:
 ; X64-AVX:       # BB#0:
-; X64-AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X64-AVX-NEXT:    vzeroupper
 ; X64-AVX-NEXT:    retq
@ -110,34 +108,32 @@ define <4 x float> @mask_sitofp_4i64_4f32(<4 x i64> %a) nounwind {
 define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind {
 ; X32-SSE-LABEL: mask_uitofp_4i64_4f32:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    andps {{\.LCPI.*}}, %xmm1
-; X32-SSE-NEXT:    andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-SSE-NEXT:    andps {{\.LCPI.*}}, %xmm0
 ; X32-SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X32-SSE-NEXT:    retl
 ;
 ; X32-AVX-LABEL: mask_uitofp_4i64_4f32:
 ; X32-AVX:       # BB#0:
-; X32-AVX-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X32-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-AVX-NEXT:    vandps {{\.LCPI.*}}, %xmm0, %xmm0
 ; X32-AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X32-AVX-NEXT:    vzeroupper
 ; X32-AVX-NEXT:    retl
 ;
 ; X64-SSE-LABEL: mask_uitofp_4i64_4f32:
 ; X64-SSE:       # BB#0:
-; X64-SSE-NEXT:    andps {{.*}}(%rip), %xmm1
-; X64-SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; X64-SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; X64-SSE-NEXT:    retq
 ;
 ; X64-AVX-LABEL: mask_uitofp_4i64_4f32:
 ; X64-AVX:       # BB#0:
-; X64-AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; X64-AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X64-AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
 ; X64-AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X64-AVX-NEXT:    vzeroupper
 ; X64-AVX-NEXT:    retq
--- a/test/CodeGen/X86/vector-trunc-math.ll
+++ b/test/CodeGen/X86/vector-trunc-math.ll