From 12cd3e33d72dcd0250865b97c845e7f12c6127ce Mon Sep 17 00:00:00 2001
From: Leonard Chan
Date: Thu, 31 Jan 2019 19:15:37 +0000
Subject: [PATCH] [Intrinsic] Expand SMULFIX to MUL, MULH[US], or [US]MUL_LOHI
 on vector arguments

For a zero scale SMULFIX, expand into MUL which produces better code for X86.

For vector arguments, expand into MUL if SMULFIX is provided with a zero
scale. Otherwise, expand into MULH[US] or [US]MUL_LOHI.

Differential Revision: https://reviews.llvm.org/D56987

llvm-svn: 352783
---
 .../SelectionDAG/LegalizeVectorOps.cpp      |   9 ++
 lib/CodeGen/SelectionDAG/TargetLowering.cpp |  26 +++--
 test/CodeGen/X86/smul_fix.ll                | 100 +++++-------------
 3 files changed, 49 insertions(+), 86 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 7730db1dfce..6ff288c8976 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -141,6 +141,7 @@ class VectorLegalizer {
   SDValue ExpandROT(SDValue Op);
   SDValue ExpandFMINNUM_FMAXNUM(SDValue Op);
   SDValue ExpandAddSubSat(SDValue Op);
+  SDValue ExpandFixedPointMul(SDValue Op);
   SDValue ExpandStrictFPOp(SDValue Op);
 
   /// Implements vector promotion.
@@ -782,6 +783,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
   case ISD::UADDSAT:
   case ISD::SADDSAT:
     return ExpandAddSubSat(Op);
+  case ISD::SMULFIX:
+    return ExpandFixedPointMul(Op);
   case ISD::STRICT_FADD:
   case ISD::STRICT_FSUB:
   case ISD::STRICT_FMUL:
@@ -1217,6 +1220,12 @@ SDValue VectorLegalizer::ExpandAddSubSat(SDValue Op) {
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
+SDValue VectorLegalizer::ExpandFixedPointMul(SDValue Op) {
+  if (SDValue Expanded = TLI.expandFixedPointMul(Op.getNode(), DAG))
+    return Expanded;
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 SDValue VectorLegalizer::ExpandStrictFPOp(SDValue Op) {
   EVT VT = Op.getValueType();
   EVT EltVT = VT.getVectorElementType();
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index c910a845ac2..e759089aa43 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -5362,29 +5362,25 @@ SDValue TargetLowering::expandAddSubSat(SDNode *Node, SelectionDAG &DAG) const {
 SDValue
 TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
   assert(Node->getOpcode() == ISD::SMULFIX && "Expected opcode to be SMULFIX.");
-  assert(Node->getNumOperands() == 3 &&
-         "Expected signed fixed point multiplication to have 3 operands.");
 
   SDLoc dl(Node);
   SDValue LHS = Node->getOperand(0);
   SDValue RHS = Node->getOperand(1);
-  assert(LHS.getValueType().isScalarInteger() &&
-         "Expected operands to be integers. Vector of int arguments should "
-         "already be unrolled.");
-  assert(RHS.getValueType().isScalarInteger() &&
-         "Expected operands to be integers. Vector of int arguments should "
-         "already be unrolled.");
+  EVT VT = LHS.getValueType();
+  unsigned Scale = Node->getConstantOperandVal(2);
+
+  // [us]mul.fix(a, b, 0) -> mul(a, b)
+  if (!Scale) {
+    if (VT.isVector() && !isOperationLegalOrCustom(ISD::MUL, VT))
+      return SDValue();
+    return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+  }
+
   assert(LHS.getValueType() == RHS.getValueType() &&
          "Expected both operands to be the same type");
-
-  unsigned Scale = Node->getConstantOperandVal(2);
-  EVT VT = LHS.getValueType();
   assert(Scale < VT.getScalarSizeInBits() &&
          "Expected scale to be less than the number of bits.");
 
-  if (!Scale)
-    return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
-
   // Get the upper and lower bits of the result.
   SDValue Lo, Hi;
   if (isOperationLegalOrCustom(ISD::SMUL_LOHI, VT)) {
@@ -5395,6 +5391,8 @@ TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
   } else if (isOperationLegalOrCustom(ISD::MULHS, VT)) {
     Lo = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
     Hi = DAG.getNode(ISD::MULHS, dl, VT, LHS, RHS);
+  } else if (VT.isVector()) {
+    return SDValue();
   } else {
     report_fatal_error("Unable to expand signed fixed point multiplication.");
   }
diff --git a/test/CodeGen/X86/smul_fix.ll b/test/CodeGen/X86/smul_fix.ll
index 2e69a2666f9..30db0d3b4d6 100644
--- a/test/CodeGen/X86/smul_fix.ll
+++ b/test/CodeGen/X86/smul_fix.ll
@@ -135,52 +135,27 @@ define i4 @func3(i4 %x, i4 %y) nounwind {
 define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-LABEL: vec:
 ; X64:       # %bb.0:
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; X64-NEXT:    movd %xmm2, %eax
-; X64-NEXT:    cltq
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; X64-NEXT:    movd %xmm2, %ecx
-; X64-NEXT:    movslq %ecx, %rcx
-; X64-NEXT:    imulq %rax, %rcx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    shrq $32, %rax
-; X64-NEXT:    shldl $30, %ecx, %eax
-; X64-NEXT:    movd %eax, %xmm2
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; X64-NEXT:    movd %xmm3, %eax
-; X64-NEXT:    cltq
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; X64-NEXT:    movd %xmm3, %ecx
-; X64-NEXT:    movslq %ecx, %rcx
-; X64-NEXT:    imulq %rax, %rcx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    shrq $32, %rax
-; X64-NEXT:    shldl $30, %ecx, %eax
-; X64-NEXT:    movd %eax, %xmm3
-; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X64-NEXT:    movd %xmm1, %eax
-; X64-NEXT:    cltq
-; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    movslq %ecx, %rcx
-; X64-NEXT:    imulq %rax, %rcx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    shrq $32, %rax
-; X64-NEXT:    shldl $30, %ecx, %eax
-; X64-NEXT:    movd %eax, %xmm2
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X64-NEXT:    movd %xmm1, %eax
-; X64-NEXT:    cltq
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    movslq %ecx, %rcx
-; X64-NEXT:    imulq %rax, %rcx
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    shrq $32, %rax
-; X64-NEXT:    shldl $30, %ecx, %eax
-; X64-NEXT:    movd %eax, %xmm0
-; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    pxor %xmm2, %xmm2
+; X64-NEXT:    pxor %xmm3, %xmm3
+; X64-NEXT:    pcmpgtd %xmm1, %xmm3
+; X64-NEXT:    pand %xmm0, %xmm3
+; X64-NEXT:    pcmpgtd %xmm0, %xmm2
+; X64-NEXT:    pand %xmm1, %xmm2
+; X64-NEXT:    paddd %xmm3, %xmm2
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
+; X64-NEXT:    pmuludq %xmm1, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,3,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-NEXT:    pmuludq %xmm3, %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
+; X64-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
+; X64-NEXT:    psubd %xmm2, %xmm4
+; X64-NEXT:    pslld $30, %xmm4
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X64-NEXT:    psrld $2, %xmm0
+; X64-NEXT:    por %xmm4, %xmm0
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: vec:
@@ -295,32 +270,13 @@ define i4 @func6(i4 %x, i4 %y) nounwind {
 define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-LABEL: vec2:
 ; X64:       # %bb.0:
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[3,1,2,3]
-; X64-NEXT:    movd %xmm2, %eax
-; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
-; X64-NEXT:    movd %xmm2, %ecx
-; X64-NEXT:    imull %eax, %ecx
-; X64-NEXT:    movd %ecx, %xmm2
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
-; X64-NEXT:    movd %xmm3, %eax
-; X64-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; X64-NEXT:    movd %xmm3, %ecx
-; X64-NEXT:    imull %eax, %ecx
-; X64-NEXT:    movd %ecx, %xmm3
-; X64-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X64-NEXT:    movd %xmm1, %eax
-; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    imull %eax, %ecx
-; X64-NEXT:    movd %ecx, %xmm2
-; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
-; X64-NEXT:    movd %xmm1, %eax
-; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
-; X64-NEXT:    movd %xmm0, %ecx
-; X64-NEXT:    imull %eax, %ecx
-; X64-NEXT:    movd %ecx, %xmm0
-; X64-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; X64-NEXT:    movdqa %xmm2, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; X64-NEXT:    pmuludq %xmm1, %xmm0
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; X64-NEXT:    pmuludq %xmm2, %xmm1
+; X64-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
+; X64-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: vec2:
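
Note on the expansion: for a nonzero scale, the MULH[US]/[US]MUL_LOHI path
builds each lane from the low and high halves of the full signed product as
(Lo >> Scale) | (Hi << (N - Scale)), and for a zero scale it emits a plain
MUL. The C++ sketch below restates that per-lane computation for i32 elements;
smul_fix_i32 is a hypothetical name used only for illustration, not code from
the tree:

    #include <cstdint>

    // One lane of ISD::SMULFIX on i32: keep bits [Scale, Scale + 32) of the
    // 64-bit signed product.
    int32_t smul_fix_i32(int32_t A, int32_t B, unsigned Scale) {
      int64_t Prod = static_cast<int64_t>(A) * static_cast<int64_t>(B);
      uint32_t Lo = static_cast<uint32_t>(Prod);        // low half  (MUL)
      uint32_t Hi = static_cast<uint32_t>(Prod >> 32);  // high half (MULHS)
      if (Scale == 0)                                   // zero scale -> plain MUL
        return static_cast<int32_t>(Lo);
      return static_cast<int32_t>((Lo >> Scale) | (Hi << (32 - Scale)));
    }

For @vec (scale 2) this shows up in the new X64 checks as psrld $2 on the low
halves, pslld $30 on the high halves, and por to combine them; the
pcmpgtd/pand/paddd/psubd sequence corrects the unsigned pmuludq high halves to
the signed high product, since SSE2 has no signed 32-bit high multiply.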