[PPC][DAGCombine] Convert SETCC to subtract when the result is zero extended

When we see a SETCC whose only users are zero extend operations, we can replace it with a subtraction. This results in doing all calculations in GPRs and avoids CR use. Currently we do this only for ULT, ULE, UGT and UGE condition codes. There are ways that this can be extended. For example for signed condition codes. In that case we will be introducing additional sign extend instructions, so more careful profitability analysis may be required. Another direction to extend this is for equal, not equal conditions. Also when users of SETCC are any_ext or sign_ext, we might be able to do something similar. llvm-svn: 287329
2024-11-23 11:13:28 +01:00 · 2016-11-18 10:41:44 +00:00 · 2016-11-18 10:41:44 +00:00 · 852aec9243
commit 852aec9243
parent 9416bb4ccb
3 changed files with 184 additions and 1 deletions
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@ -9976,6 +9976,87 @@ static bool findConsecutiveLoad(LoadSDNode *LD, SelectionDAG &DAG) {
  return false;
 }

+
+/// This function is called when we have proved that a SETCC node can be replaced
+/// by subtraction (and other supporting instructions) so that the result of
+/// comparison is kept in a GPR instead of CR. This function is purely for
+/// codegen purposes and has some flags to guide the codegen process.
+static SDValue generateEquivalentSub(SDNode *N, int Size, bool Complement,
+                                     bool Swap, SDLoc &DL, SelectionDAG &DAG) {
+
+  assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
+
+  // Zero extend the operands to the largest legal integer. Originally, they
+  // must be of a strictly smaller size.
+  auto Op0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(0),
+                         DAG.getConstant(Size, DL, MVT::i32));
+  auto Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, N->getOperand(1),
+                         DAG.getConstant(Size, DL, MVT::i32));
+
+  // Swap if needed. Depends on the condition code.
+  if (Swap)
+    std::swap(Op0, Op1);
+
+  // Subtract extended integers.
+  auto SubNode = DAG.getNode(ISD::SUB, DL, MVT::i64, Op0, Op1);
+
+  // Move the sign bit to the least significant position and zero out the rest.
+  // Now the least significant bit carries the result of original comparison.
+  auto Shifted = DAG.getNode(ISD::SRL, DL, MVT::i64, SubNode,
+                             DAG.getConstant(Size - 1, DL, MVT::i32));
+  auto Final = Shifted;
+
+  // Complement the result if needed. Based on the condition code.
+  if (Complement)
+    Final = DAG.getNode(ISD::XOR, DL, MVT::i64, Shifted,
+                        DAG.getConstant(1, DL, MVT::i64));
+
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Final);
+}
+
+SDValue PPCTargetLowering::ConvertSETCCToSubtract(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+
+  assert(N->getOpcode() == ISD::SETCC && "ISD::SETCC Expected.");
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  // Size of integers being compared has a critical role in the following
+  // analysis, so we prefer to do this when all types are legal.
+  if (!DCI.isAfterLegalizeVectorOps())
+    return SDValue();
+
+  // If all users of SETCC extend its value to a legal integer type
+  // then we replace SETCC with a subtraction
+  for (SDNode::use_iterator UI = N->use_begin(),
+       UE = N->use_end(); UI != UE; ++UI) {
+    if (UI->getOpcode() != ISD::ZERO_EXTEND)
+      return SDValue();
+  }
+
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+  auto OpSize = N->getOperand(0).getValueSizeInBits();
+
+  unsigned Size = DAG.getDataLayout().getLargestLegalIntTypeSizeInBits();
+
+  if (OpSize < Size) {
+    switch (CC) {
+    default: break;
+    case ISD::SETULT:
+      return generateEquivalentSub(N, Size, false, false, DL, DAG);
+    case ISD::SETULE:
+      return generateEquivalentSub(N, Size, true, true, DL, DAG);
+    case ISD::SETUGT:
+      return generateEquivalentSub(N, Size, false, true, DL, DAG);
+    case ISD::SETUGE:
+      return generateEquivalentSub(N, Size, true, false, DL, DAG);
+    }
+  }
+
+  return SDValue();
+}
+
 SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
@ -10017,7 +10098,8 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
                                 APInt::getHighBitsSet(OpBits, OpBits-1)) ||
          !DAG.MaskedValueIsZero(N->getOperand(1),
                                 APInt::getHighBitsSet(OpBits, OpBits-1)))
-        return SDValue();
+        return (N->getOpcode() == ISD::SETCC ? ConvertSETCCToSubtract(N, DCI)
+                                             : SDValue());
    } else {
      // This is neither a signed nor an unsigned comparison, just make sure
      // that the high bits are equal.
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@ -977,6 +977,11 @@ namespace llvm {
    SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const;
    SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const;

+    /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces
+    /// SETCC with integer subtraction when (1) there is a legal way of doing it
+    /// (2) keeping the result of comparison in GPR has performance benefit.
+    SDValue ConvertSETCCToSubtract(SDNode *N, DAGCombinerInfo &DCI) const;
+
    SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
                            int &RefinementSteps, bool &UseOneConstNR,
                            bool Reciprocal) const override;
--- a/test/CodeGen/PowerPC/setcc-to-sub.ll
+++ b/test/CodeGen/PowerPC/setcc-to-sub.ll
@ -0,0 +1,96 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
+; RUN: -mcpu=pwr8 < %s | FileCheck %s
+
+%class.PB2 = type { [1 x i32], %class.PB1* }
+%class.PB1 = type { [1 x i32], i64, i64, i32 }
+
+; Function Attrs: norecurse nounwind readonly
+define zeroext i1 @test1(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr #0 {
+entry:
+  %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
+  %0 = load i32, i32* %arrayidx.i6, align 8, !tbaa !1
+  %and.i = and i32 %0, 8
+  %arrayidx.i37 = bitcast %class.PB2* %s_b to i32*
+  %1 = load i32, i32* %arrayidx.i37, align 8, !tbaa !1
+  %and.i4 = and i32 %1, 8
+  %cmp.i5 = icmp ult i32 %and.i, %and.i4
+  ret i1 %cmp.i5
+
+; CHECK-LABEL: @test1
+; CHECK: rlwinm [[REG1:[0-9]*]]
+; CHECK-NEXT: rlwinm [[REG2:[0-9]*]]
+; CHECK-NEXT: sub [[REG3:[0-9]*]], [[REG1]], [[REG2]]
+; CHECK-NEXT: rldicl 3, [[REG3]]
+; CHECK: blr
+
+}
+
+; Function Attrs: norecurse nounwind readonly
+define zeroext i1 @test2(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr #0 {
+entry:
+  %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
+  %0 = load i32, i32* %arrayidx.i6, align 8, !tbaa !1
+  %and.i = and i32 %0, 8
+  %arrayidx.i37 = bitcast %class.PB2* %s_b to i32*
+  %1 = load i32, i32* %arrayidx.i37, align 8, !tbaa !1
+  %and.i4 = and i32 %1, 8
+  %cmp.i5 = icmp ule i32 %and.i, %and.i4
+  ret i1 %cmp.i5
+
+; CHECK-LABEL: @test2
+; CHECK: rlwinm [[REG1:[0-9]*]]
+; CHECK-NEXT: rlwinm [[REG2:[0-9]*]]
+; CHECK-NEXT: sub [[REG3:[0-9]*]], [[REG2]], [[REG1]]
+; CHECK-NEXT: rldicl [[REG4:[0-9]*]], [[REG3]]
+; CHECK-NEXT: xori 3, [[REG4]], 1
+; CHECK: blr
+
+}
+
+; Function Attrs: norecurse nounwind readonly
+define zeroext i1 @test3(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr #0 {
+entry:
+  %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
+  %0 = load i32, i32* %arrayidx.i6, align 8, !tbaa !1
+  %and.i = and i32 %0, 8
+  %arrayidx.i37 = bitcast %class.PB2* %s_b to i32*
+  %1 = load i32, i32* %arrayidx.i37, align 8, !tbaa !1
+  %and.i4 = and i32 %1, 8
+  %cmp.i5 = icmp ugt i32 %and.i, %and.i4
+  ret i1 %cmp.i5
+
+; CHECK-LABEL: @test3
+; CHECK: rlwinm [[REG1:[0-9]*]]
+; CHECK-NEXT: rlwinm [[REG2:[0-9]*]]
+; CHECK-NEXT: sub [[REG3:[0-9]*]], [[REG2]], [[REG1]]
+; CHECK-NEXT: rldicl 3, [[REG3]]
+; CHECK: blr
+
+}
+
+; Function Attrs: norecurse nounwind readonly
+define zeroext i1 @test4(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr #0 {
+entry:
+  %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
+  %0 = load i32, i32* %arrayidx.i6, align 8, !tbaa !1
+  %and.i = and i32 %0, 8
+  %arrayidx.i37 = bitcast %class.PB2* %s_b to i32*
+  %1 = load i32, i32* %arrayidx.i37, align 8, !tbaa !1
+  %and.i4 = and i32 %1, 8
+  %cmp.i5 = icmp uge i32 %and.i, %and.i4
+  ret i1 %cmp.i5
+
+; CHECK-LABEL: @test4
+; CHECK: rlwinm [[REG1:[0-9]*]]
+; CHECK-NEXT: rlwinm [[REG2:[0-9]*]]
+; CHECK-NEXT: sub [[REG3:[0-9]*]], [[REG1]], [[REG2]]
+; CHECK-NEXT: rldicl [[REG4:[0-9]*]], [[REG3]]
+; CHECK-NEXT: xori 3, [[REG4]], 1
+; CHECK: blr
+
+}
+
+!1 = !{!2, !2, i64 0}
+!2 = !{!"int", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C++ TBAA"}