
[PowerPC] fix incorrect vectorization of abs() on POWER9

Vectorized loops that call abs() return incorrect results on POWER9. This patch fixes that.
For example, the following code returns a negative result when the input values are negative, even though it sums the absolute values of the inputs.

#include <stdint.h>
#include <stdlib.h>

int vpx_satd_c(const int16_t *coeff, int length) {
  int satd = 0;
  for (int i = 0; i < length; ++i) satd += abs(coeff[i]);
  return satd;
}

This problem causes test failures for libvpx.
For vector absolute value and vector absolute difference on POWER9, LLVM generates the VABSDUW (Vector Absolute Difference Unsigned Word) instruction or one of its variants.
Since these instructions operate on unsigned integers, an adjustment is needed for signed integers.
For abs(sub(a, b)), we generate VABSDUW(a+0x80000000, b+0x80000000); otherwise, abs(sub(-1, 0)) would return 0xFFFFFFFF (= -1) instead of 1. For abs(a), we generate VABSDUW(a+0x80000000, 0x80000000).
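
As a scalar illustration of why the bias works (a minimal sketch, not the patch itself; absd_unsigned here simply models what VABSDUW does to each 32-bit element):

#include <cassert>
#include <cstdint>

// Per-element behaviour of vabsduw: unsigned absolute difference.
static uint32_t absd_unsigned(uint32_t a, uint32_t b) {
  return a > b ? a - b : b - a;
}

int main() {
  // Unsigned difference alone is wrong for signed inputs:
  // |(-1) - 0| comes out as 0xFFFFFFFF.
  assert(absd_unsigned((uint32_t)-1, 0u) == 0xFFFFFFFFu);
  // Adding 0x80000000 to both operands maps the signed range onto the
  // unsigned range while preserving order, so the unsigned difference of
  // the biased values equals the signed |a - b|.
  assert(absd_unsigned((uint32_t)-1 + 0x80000000u, 0u + 0x80000000u) == 1u);
  // abs(a) is the b == 0 case: VABSDUW(a + 0x80000000, 0x80000000).
  assert(absd_unsigned((uint32_t)-5 + 0x80000000u, 0x80000000u) == 5u);
  return 0;
}

The patch performs this per-element +0x80000000 step (flipSignBit) before emitting the VABSDU instruction.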

Differential Revision: https://reviews.llvm.org/D45522

llvm-svn: 330497
Hiroshi Inoue 2018-04-21 09:32:17 +00:00
parent 6c495ec529
commit 0a8b8fc8aa
3 changed files with 112 additions and 23 deletions


@@ -327,6 +327,7 @@ private:
    bool isOffsetMultipleOf(SDNode *N, unsigned Val) const;
    void transferMemOperands(SDNode *N, SDNode *Result);
    MachineSDNode *flipSignBit(const SDValue &N, SDNode **SignBit = nullptr);
  };
} // end anonymous namespace
@@ -3970,6 +3971,51 @@ void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
  cast<MachineSDNode>(Result)->setMemRefs(MemOp, MemOp + 1);
}
/// This method returns a node after flipping the MSB of each element
/// of vector integer type. Additionally, if SignBitVec is non-null,
/// this method sets a node with one at MSB of all elements
/// and zero at other bits in SignBitVec.
MachineSDNode *
PPCDAGToDAGISel::flipSignBit(const SDValue &N, SDNode **SignBitVec) {
  SDLoc dl(N);
  EVT VecVT = N.getValueType();
  if (VecVT == MVT::v4i32) {
    if (SignBitVec) {
      SDNode *ZV = CurDAG->getMachineNode(PPC::V_SET0, dl, MVT::v4i32);
      *SignBitVec = CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT,
                                           SDValue(ZV, 0));
    }
    return CurDAG->getMachineNode(PPC::XVNEGSP, dl, VecVT, N);
  }
  else if (VecVT == MVT::v8i16) {
    SDNode *Hi = CurDAG->getMachineNode(PPC::LIS, dl, MVT::i32,
                                        getI32Imm(0x8000, dl));
    SDNode *ScaImm = CurDAG->getMachineNode(PPC::ORI, dl, MVT::i32,
                                            SDValue(Hi, 0),
                                            getI32Imm(0x8000, dl));
    SDNode *VecImm = CurDAG->getMachineNode(PPC::MTVSRWS, dl, VecVT,
                                            SDValue(ScaImm, 0));
    /*
    Alternatively, we can do this as follows to use VRF instead of GPR.
      vspltish 5, 1
      vspltish 6, 15
      vslh 5, 6, 5
    */
    if (SignBitVec) *SignBitVec = VecImm;
    return CurDAG->getMachineNode(PPC::VADDUHM, dl, VecVT, N,
                                  SDValue(VecImm, 0));
  }
  else if (VecVT == MVT::v16i8) {
    SDNode *VecImm = CurDAG->getMachineNode(PPC::XXSPLTIB, dl, MVT::i32,
                                            getI32Imm(0x80, dl));
    if (SignBitVec) *SignBitVec = VecImm;
    return CurDAG->getMachineNode(PPC::VADDUBM, dl, VecVT, N,
                                  SDValue(VecImm, 0));
  }
  else
    llvm_unreachable("Unsupported vector data type for flipSignBit");
}
// Select - Convert the specified operand from a target-independent to a
// target-specific node if it hasn't already been changed.
void PPCDAGToDAGISel::Select(SDNode *N) {
@@ -4783,6 +4829,55 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
}
  case ISD::ABS: {
    assert(PPCSubTarget->hasP9Vector() && "ABS is supported with P9 Vector");
    // For vector absolute difference, we use VABSDUW instruction of POWER9.
    // Since VABSDU instructions are for unsigned integers, we need adjustment
    // for signed integers.
    // For abs(sub(a, b)), we generate VABSDUW(a+0x80000000, b+0x80000000).
    // Otherwise, abs(sub(-1, 0)) returns 0xFFFFFFFF(=-1) instead of 1.
    // For abs(a), we generate VABSDUW(a+0x80000000, 0x80000000).
    EVT VecVT = N->getOperand(0).getValueType();
    SDNode *AbsOp = nullptr;
    unsigned AbsOpcode;
    if (VecVT == MVT::v4i32)
      AbsOpcode = PPC::VABSDUW;
    else if (VecVT == MVT::v8i16)
      AbsOpcode = PPC::VABSDUH;
    else if (VecVT == MVT::v16i8)
      AbsOpcode = PPC::VABSDUB;
    else
      llvm_unreachable("Unsupported vector data type for ISD::ABS");
    // Even for signed integers, we can skip adjustment if all values are
    // known to be positive (as signed integer) due to zero-extended inputs.
    if (N->getOperand(0).getOpcode() == ISD::SUB &&
        N->getOperand(0)->getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
        N->getOperand(0)->getOperand(1).getOpcode() == ISD::ZERO_EXTEND) {
      AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
                                     SDValue(N->getOperand(0)->getOperand(0)),
                                     SDValue(N->getOperand(0)->getOperand(1)));
      ReplaceNode(N, AbsOp);
      return;
    }
    if (N->getOperand(0).getOpcode() == ISD::SUB) {
      SDValue SubVal = N->getOperand(0);
      SDNode *Op0 = flipSignBit(SubVal->getOperand(0));
      SDNode *Op1 = flipSignBit(SubVal->getOperand(1));
      AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT,
                                     SDValue(Op0, 0), SDValue(Op1, 0));
    }
    else {
      SDNode *Op1 = nullptr;
      SDNode *Op0 = flipSignBit(N->getOperand(0), &Op1);
      AbsOp = CurDAG->getMachineNode(AbsOpcode, dl, VecVT, SDValue(Op0, 0),
                                     SDValue(Op1, 0));
    }
    ReplaceNode(N, AbsOp);
    return;
  }
  }
  SelectCode(N);
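
A note on the zero-extend shortcut in the ISD::ABS case above: when both operands of the sub come from ISD::ZERO_EXTEND, every element is non-negative even when read as a signed integer, so the unsigned difference is already the signed one and the bias can be skipped. A minimal standalone check of that reasoning (a sketch, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // Worst case after zero-extending i16 to i32: values stay in [0, 65535].
  uint16_t a = 0xFFFF, b = 0x0001;
  uint32_t za = a, zb = b;                      // zext i16 -> i32
  uint32_t ud = za > zb ? za - zb : zb - za;    // what vabsduw computes
  int32_t sd = (int32_t)za - (int32_t)zb;       // signed difference, no overflow
  assert(ud == (uint32_t)(sd < 0 ? -sd : sd));
  return 0;
}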


@@ -1504,18 +1504,4 @@ def VABSDUW : VXForm_1<1155, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
                       "vabsduw $vD, $vA, $vB", IIC_VecGeneral,
                       [(set v4i32:$vD, (int_ppc_altivec_vabsduw v4i32:$vA, v4i32:$vB))]>;
def : Pat<(v16i8:$vD (abs v16i8:$vA)),
          (v16i8 (VABSDUB $vA, (v16i8 (V_SET0B))))>;
def : Pat<(v8i16:$vD (abs v8i16:$vA)),
          (v8i16 (VABSDUH $vA, (v8i16 (V_SET0H))))>;
def : Pat<(v4i32:$vD (abs v4i32:$vA)),
          (v4i32 (VABSDUW $vA, (v4i32 (V_SET0))))>;
def : Pat<(v16i8:$vD (abs (sub v16i8:$vA, v16i8:$vB))),
          (v16i8 (VABSDUB $vA, $vB))>;
def : Pat<(v8i16:$vD (abs (sub v8i16:$vA, v8i16:$vB))),
          (v8i16 (VABSDUH $vA, $vB))>;
def : Pat<(v4i32:$vD (abs (sub v4i32:$vA, v4i32:$vB))),
          (v4i32 (VABSDUW $vA, $vB))>;
} // end HasP9Altivec


@@ -9,8 +9,10 @@ entry:
%0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %a, <4 x i32> %sub.i)
ret <4 x i32> %0
; CHECK-LABEL: simple_absv_32
; CHECK: vxor [[ZERO:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
; CHECK-NEXT: vabsduw 2, 2, [[ZERO]]
; CHECK-DAG: vxor {{[0-9]+}}, [[REG:[0-9]+]], [[REG]]
; CHECK-DAG: xvnegsp 34, 34
; CHECK-DAG: xvnegsp 35, {{[0-9]+}}
; CHECK-NEXT: vabsduw 2, 2, {{[0-9]+}}
; CHECK-NEXT: blr
; CHECK-PWR8-LABEL: simple_absv_32
; CHECK-PWR8: xxlxor
@@ -26,8 +28,10 @@ entry:
%0 = tail call <4 x i32> @llvm.ppc.altivec.vmaxsw(<4 x i32> %sub.i, <4 x i32> %a)
ret <4 x i32> %0
; CHECK-LABEL: simple_absv_32_swap
; CHECK: vxor [[ZERO:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
; CHECK-NEXT: vabsduw 2, 2, [[ZERO]]
; CHECK-DAG: vxor {{[0-9]+}}, [[REG:[0-9]+]], [[REG]]
; CHECK-DAG: xvnegsp 34, 34
; CHECK-DAG: xvnegsp 35, {{[0-9]+}}
; CHECK-NEXT: vabsduw 2, 2, {{[0-9]+}}
; CHECK-NEXT: blr
; CHECK-PWR8-LABEL: simple_absv_32_swap
; CHECK-PWR8: xxlxor
@@ -42,8 +46,9 @@ entry:
%0 = tail call <8 x i16> @llvm.ppc.altivec.vmaxsh(<8 x i16> %a, <8 x i16> %sub.i)
ret <8 x i16> %0
; CHECK-LABEL: simple_absv_16
; CHECK: vxor [[ZERO:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
; CHECK-NEXT: vabsduh 2, 2, [[ZERO]]
; CHECK: mtvsrws {{[0-9]+}}, {{[0-9]+}}
; CHECK-NEXT: vadduhm 2, 2, [[IMM:[0-9]+]]
; CHECK-NEXT: vabsduh 2, 2, [[IMM]]
; CHECK-NEXT: blr
; CHECK-PWR8-LABEL: simple_absv_16
; CHECK-PWR8: xxlxor
@@ -59,8 +64,9 @@ entry:
%0 = tail call <16 x i8> @llvm.ppc.altivec.vmaxsb(<16 x i8> %a, <16 x i8> %sub.i)
ret <16 x i8> %0
; CHECK-LABEL: simple_absv_8
; CHECK: vxor [[ZERO:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
; CHECK-NEXT: vabsdub 2, 2, [[ZERO]]
; CHECK: xxspltib {{[0-9]+}}, 128
; CHECK-NEXT: vaddubm 2, 2, [[IMM:[0-9]+]]
; CHECK-NEXT: vabsdub 2, 2, [[IMM]]
; CHECK-NEXT: blr
; CHECK-PWR8-LABEL: simple_absv_8
; CHECK-PWR8: xxlxor
@@ -79,7 +85,9 @@ entry:
%3 = select <4 x i1> %1, <4 x i32> %0, <4 x i32> %2
ret <4 x i32> %3
; CHECK-LABEL: sub_absv_32
; CHECK: vabsduw 2, 2, 3
; CHECK-DAG: xvnegsp 34, 34
; CHECK-DAG: xvnegsp 35, 35
; CHECK-NEXT: vabsduw 2, 2, 3
; CHECK-NEXT: blr
; CHECK-PWR8-LABEL: sub_absv_32
; CHECK-PWR8: vsubuwm