Split the store of a wide value merged from an int-fp pair into multiple stores.

For the store of a wide value merged from a pair of values, especially int-fp pair, sometimes it is more efficent to split it into separate narrow stores, which can remove the bitwise instructions or sink them to colder places. Now the feature is only enabled on x86 target, and only store of int-fp pair is splitted. It is possible that the application scope gets extended with perf evidence support in the future. Differential Revision: https://reviews.llvm.org/D22840 llvm-svn: 280505
2024-10-19 11:02:59 +02:00 · 2016-09-02 17:17:04 +00:00 · 2016-09-02 17:17:04 +00:00 · 6f063606d0
commit 6f063606d0
parent a2a2ac53eb
4 changed files with 191 additions and 0 deletions
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@ -335,6 +335,12 @@ public:
    return false;
  }

+  /// \brief Return true if it is cheaper to split the store of a merged int val
+  /// from a pair of smaller values into multiple stores.
+  virtual bool isMultiStoresCheaperThanBitsMerge(SDValue Lo, SDValue Hi) const {
+    return false;
+  }
+
  /// \brief Return if the target supports combining a
  /// chain like:
  /// \code
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@ -374,6 +374,7 @@ namespace {
    SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
    SDValue ReduceLoadWidth(SDNode *N);
    SDValue ReduceLoadOpStoreWidth(SDNode *N);
+    SDValue splitMergedValStore(StoreSDNode *ST);
    SDValue TransformFPLoadStorePair(SDNode *N);
    SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
    SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
@ -12200,9 +12201,111 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
      return NewSt;
  }

+  if (SDValue NewSt = splitMergedValStore(ST))
+    return NewSt;
+
  return ReduceLoadOpStoreWidth(N);
 }

+/// For the instruction sequence of store below, F and I values
+/// are bundled together as an i64 value before being stored into memory.
+/// Sometimes it is more efficent to generate separate stores for F and I,
+/// which can remove the bitwise instructions or sink them to colder places.
+///
+///   (store (or (zext (bitcast F to i32) to i64),
+///              (shl (zext I to i64), 32)), addr)  -->
+///   (store F, addr) and (store I, addr+4)
+///
+/// Similarly, splitting for other merged store can also be beneficial, like:
+/// For pair of {i32, i32}, i64 store --> two i32 stores.
+/// For pair of {i32, i16}, i64 store --> two i32 stores.
+/// For pair of {i16, i16}, i32 store --> two i16 stores.
+/// For pair of {i16, i8},  i32 store --> two i16 stores.
+/// For pair of {i8, i8},   i16 store --> two i8 stores.
+///
+/// We allow each target to determine specifically which kind of splitting is
+/// supported.
+///
+/// The store patterns are commonly seen from the simple code snippet below
+/// if only std::make_pair(...) is sroa transformed before inlined into hoo.
+///   void goo(const std::pair<int, float> &);
+///   hoo() {
+///     ...
+///     goo(std::make_pair(tmp, ftmp));
+///     ...
+///   }
+///
+SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
+  if (OptLevel == CodeGenOpt::None)
+    return SDValue();
+
+  SDValue Val = ST->getValue();
+  SDLoc DL(ST);
+
+  // Match OR operand.
+  if (!Val.getValueType().isScalarInteger() || Val.getOpcode() != ISD::OR)
+    return SDValue();
+
+  // Match SHL operand and get Lower and Higher parts of Val.
+  SDValue Op1 = Val.getOperand(0);
+  SDValue Op2 = Val.getOperand(1);
+  SDValue Lo, Hi;
+  if (Op1.getOpcode() != ISD::SHL) {
+    std::swap(Op1, Op2);
+    if (Op1.getOpcode() != ISD::SHL)
+      return SDValue();
+  }
+  Lo = Op2;
+  Hi = Op1.getOperand(0);
+  if (!Op1.hasOneUse())
+    return SDValue();
+
+  // Match shift amount to HalfValBitSize.
+  unsigned HalfValBitSize = Val.getValueType().getSizeInBits() / 2;
+  ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(Op1.getOperand(1));
+  if (!ShAmt || ShAmt->getAPIntValue() != HalfValBitSize)
+    return SDValue();
+
+  // Lo and Hi are zero-extended from int with size less equal than 32
+  // to i64.
+  if (Lo.getOpcode() != ISD::ZERO_EXTEND || !Lo.hasOneUse() ||
+      !Lo.getOperand(0).getValueType().isScalarInteger() ||
+      Lo.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize ||
+      Hi.getOpcode() != ISD::ZERO_EXTEND || !Hi.hasOneUse() ||
+      !Hi.getOperand(0).getValueType().isScalarInteger() ||
+      Hi.getOperand(0).getValueType().getSizeInBits() > HalfValBitSize)
+    return SDValue();
+
+  if (!TLI.isMultiStoresCheaperThanBitsMerge(Lo.getOperand(0),
+                                             Hi.getOperand(0)))
+    return SDValue();
+
+  // Start to split store.
+  unsigned Alignment = ST->getAlignment();
+  MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+  AAMDNodes AAInfo = ST->getAAInfo();
+
+  // Change the sizes of Lo and Hi's value types to HalfValBitSize.
+  EVT VT = EVT::getIntegerVT(*DAG.getContext(), HalfValBitSize);
+  Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo.getOperand(0));
+  Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi.getOperand(0));
+
+  SDValue Chain = ST->getChain();
+  SDValue Ptr = ST->getBasePtr();
+  // Lower value store.
+  SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
+                             ST->getAlignment(), MMOFlags, AAInfo);
+  Ptr =
+      DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                  DAG.getConstant(HalfValBitSize / 8, DL, Ptr.getValueType()));
+  // Higher value store.
+  SDValue St1 =
+      DAG.getStore(Chain, DL, Hi, Ptr,
+                   ST->getPointerInfo().getWithOffset(HalfValBitSize / 8),
+                   Alignment / 2, MMOFlags, AAInfo);
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, St0, St1);
+}
+
 SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
  SDValue InVec = N->getOperand(0);
  SDValue InVal = N->getOperand(1);
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@ -764,6 +764,26 @@ namespace llvm {
      return VT == MVT::f32 || VT == MVT::f64 || VT.isVector();
    }

+    bool isMultiStoresCheaperThanBitsMerge(SDValue Lo,
+                                           SDValue Hi) const override {
+      // If the pair to store is a mixture of float and int values, we will
+      // save two bitwise instructions and one float-to-int instruction and
+      // increase one store instruction. There is potentially a more
+      // significant benefit because it avoids the float->int domain switch
+      // for input value. So It is more likely a win.
+      if (Lo.getOpcode() == ISD::BITCAST || Hi.getOpcode() == ISD::BITCAST) {
+        SDValue Opd = (Lo.getOpcode() == ISD::BITCAST) ? Lo.getOperand(0)
+                                                       : Hi.getOperand(0);
+        if (Opd.getValueType().isFloatingPoint())
+          return true;
+      }
+      // If the pair only contains int values, we will save two bitwise
+      // instructions and increase one store instruction (costing one more
+      // store buffer). Since the benefit is more blurred so we leave
+      // such pair out until we get testcase to prove it is a win.
+      return false;
+    }
+
    bool hasAndNotCompare(SDValue Y) const override;

    /// Return the value type to use for ISD::SETCC.
--- a/test/Transforms/InstCombine/split-store.ll
+++ b/test/Transforms/InstCombine/split-store.ll
@ -0,0 +1,62 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: int32_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movl %edi, (%rsi)
+define void @int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: float_int32_pair
+; CHECK: movl %edi, 4(%rsi)
+; CHECK: movss %xmm0, (%rsi)
+define void @float_int32_pair(float %tmp1, i32 %tmp2, i64* %ref.tmp) {
+entry:
+  %t0 = bitcast float %tmp1 to i32
+  %t1 = zext i32 %tmp2 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i32 %t0 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int16_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movzwl	%di, %eax
+; CHECK: movl %eax, (%rsi)
+define void @int16_float_pair(i16 signext %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i16 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}
+
+; CHECK-LABEL: int8_float_pair
+; CHECK: movss %xmm0, 4(%rsi)
+; CHECK: movzbl	%dil, %eax
+; CHECK: movl %eax, (%rsi)
+define void @int8_float_pair(i8 signext %tmp1, float %tmp2, i64* %ref.tmp) {
+entry:
+  %t0 = bitcast float %tmp2 to i32
+  %t1 = zext i32 %t0 to i64
+  %t2 = shl nuw i64 %t1, 32
+  %t3 = zext i8 %tmp1 to i64
+  %t4 = or i64 %t2, %t3
+  store i64 %t4, i64* %ref.tmp, align 8
+  ret void
+}