
[SLP] Fix for PR32164: Improve vectorization of reverse order of extract operations.

Summary: Vectorization of insertelement instructions whose operands are extractelement instructions can produce an extra shuffle operation when those operands appear in reverse order. This patch improves that case by reordering the operands, which removes the extra shuffle operation.

Reviewers: mkuper, hfinkel, RKSimon, spatel

Subscribers: mzolotukhin, llvm-commits

Differential Revision: https://reviews.llvm.org/D33954

llvm-svn: 322579
Author: Alexey Bataev
Date: 2018-01-16 18:17:01 +00:00
Commit: 2364802067 (parent 3c09cf2a76)
2 changed files with 33 additions and 30 deletions

lib/Transforms/Vectorize/SLPVectorizer.cpp

@@ -585,8 +585,7 @@ public:
     ScalarToTreeEntry.clear();
     MustGather.clear();
     ExternalUses.clear();
-    NumLoadsWantToKeepOrder = 0;
-    NumLoadsWantToChangeOrder = 0;
+    NumOpsWantToKeepOrder.clear();
     for (auto &Iter : BlocksSchedules) {
       BlockScheduling *BS = Iter.second.get();
       BS->clear();
@@ -601,7 +600,12 @@ public:

   /// \returns true if it is beneficial to reverse the vector order.
   bool shouldReorder() const {
-    return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
+    return std::accumulate(
+               NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), 0,
+               [](int Val1,
+                  const decltype(NumOpsWantToKeepOrder)::value_type &Val2) {
+                 return Val1 + (Val2.second < 0 ? 1 : -1);
+               }) > 0;
   }

   /// \return The vector element size in bits to use when vectorizing the
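
The new shouldReorder takes a majority vote across opcodes: each map entry was incremented for every bundle that is consecutive as-is and decremented for every bundle that is consecutive only when reversed, so a negative value means that opcode prefers the reverse order. A minimal standalone sketch of that logic, assuming std::map stands in for llvm::DenseMap (names here are illustrative, not LLVM API):

#include <map>
#include <numeric>

// One vote per opcode: a negative per-opcode counter is one vote for
// reversing the whole vectorizable tree, anything else is a vote to keep it.
bool shouldReorder(const std::map<unsigned, int> &NumOpsWantToKeepOrder) {
  return std::accumulate(
             NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(), 0,
             [](int Votes, const std::pair<const unsigned, int> &Entry) {
               return Votes + (Entry.second < 0 ? 1 : -1);
             }) > 0;
}

Note that an opcode whose counter is exactly zero still votes to keep the order, so the tree is reversed only when strictly more opcode classes prefer the reversed order.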
@@ -1201,11 +1205,10 @@ private:
   /// List of users to ignore during scheduling and that don't need extracting.
   ArrayRef<Value *> UserIgnoreList;

-  // Number of load bundles that contain consecutive loads.
-  int NumLoadsWantToKeepOrder = 0;
-
-  // Number of load bundles that contain consecutive loads in reversed order.
-  int NumLoadsWantToChangeOrder = 0;
+  /// Number of operation bundles that contain consecutive operations minus the
+  /// number of operation bundles that contain consecutive operations in
+  /// reversed order, keyed by opcode.
+  DenseMap<unsigned, int> NumOpsWantToKeepOrder;

   // Analysis and block reference.
   Function *F;
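
The single signed counter per opcode generalizes the old pair of load-only counters. A hedged sketch of the update pattern used by the hunks below (recordBundle is an illustrative helper, not part of the patch; std::map stands in for the DenseMap):

#include <map>

std::map<unsigned, int> NumOpsWantToKeepOrder; // stand-in for the DenseMap

// One signed counter per opcode (e.g. Instruction::Load,
// Instruction::ExtractElement) replaces the former
// NumLoadsWantToKeepOrder/NumLoadsWantToChangeOrder pair.
void recordBundle(unsigned Opcode, bool ConsecutiveAsIs) {
  if (ConsecutiveAsIs)
    ++NumOpsWantToKeepOrder[Opcode]; // usable in the given order
  else
    --NumOpsWantToKeepOrder[Opcode]; // usable only when reversed
}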
@@ -1543,7 +1546,11 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       bool Reuse = canReuseExtract(VL, VL0);
       if (Reuse) {
         DEBUG(dbgs() << "SLP: Reusing extract sequence.\n");
+        ++NumOpsWantToKeepOrder[S.Opcode];
       } else {
+        SmallVector<Value *, 4> ReverseVL(VL.rbegin(), VL.rend());
+        if (canReuseExtract(ReverseVL, VL0))
+          --NumOpsWantToKeepOrder[S.Opcode];
         BS.cancelScheduling(VL, VL0);
       }
       newTreeEntry(VL, Reuse, UserTreeIdx);
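
The fallback added above asks whether the same extracts would form a reusable sequence back-to-front. A simplified sketch of that test (canReuseExtract itself also verifies that every scalar extracts from one common source vector; the helper below models only the lane-order check):

#include <vector>

// Extracts[i] is the lane index read by the i-th scalar in the bundle.
// A bundle is "reusable" here when the lanes form the identity sequence
// 0, 1, ..., n-1, either as given or after reversing the bundle.
bool canReuseAsIsOrReversed(const std::vector<unsigned> &Extracts,
                            bool &Reversed) {
  auto IsIdentity = [](const std::vector<unsigned> &Lanes) {
    for (unsigned I = 0, E = Lanes.size(); I != E; ++I)
      if (Lanes[I] != I)
        return false;
    return true;
  };
  if (IsIdentity(Extracts)) {
    Reversed = false;
    return true;
  }
  std::vector<unsigned> Rev(Extracts.rbegin(), Extracts.rend());
  Reversed = IsIdentity(Rev); // lanes were n-1, ..., 1, 0
  return Reversed;
}

For a bundle extracting lanes 3, 2, 1, 0 this reports Reversed = true, which is exactly the case that previously forced the extra reversing shufflevector visible in the test diff below.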
@@ -1593,7 +1600,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       }

       if (Consecutive) {
-        ++NumLoadsWantToKeepOrder;
+        ++NumOpsWantToKeepOrder[S.Opcode];
         newTreeEntry(VL, true, UserTreeIdx);
         DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         return;
@@ -1612,7 +1619,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
       newTreeEntry(VL, false, UserTreeIdx);

       if (ReverseConsecutive) {
-        ++NumLoadsWantToChangeOrder;
+        --NumOpsWantToKeepOrder[S.Opcode];
         DEBUG(dbgs() << "SLP: Gathering reversed loads.\n");
       } else {
         DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");

test/Transforms/SLPVectorizer/X86/reverse_extract_elements.ll

@@ -5,13 +5,12 @@ define float @dotf(<4 x float> %x, <4 x float> %y) {
 ; CHECK-LABEL: @dotf(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TMP0:%.*]] = fmul fast <4 x float> [[X:%.*]], [[Y:%.*]]
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP0]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    ret float [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret float [[TMP1]]
 ;
 entry:
   %vecext = extractelement <4 x float> %x, i32 0
@@ -38,13 +37,12 @@ define double @dotd(<4 x double>* byval nocapture readonly align 32, <4 x double
 ; CHECK-NEXT:    [[X:%.*]] = load <4 x double>, <4 x double>* [[TMP0:%.*]], align 32
 ; CHECK-NEXT:    [[Y:%.*]] = load <4 x double>, <4 x double>* [[TMP1:%.*]], align 32
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> [[X]], [[Y]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP2]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    ret double [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
 entry:
   %x = load <4 x double>, <4 x double>* %0, align 32
@@ -73,13 +71,12 @@ define float @dotfq(<4 x float>* nocapture readonly %x, <4 x float>* nocapture r
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, <4 x float>* [[X:%.*]], align 16
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[Y:%.*]], align 16
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x float> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP2]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    ret float [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
 entry:
   %0 = load <4 x float>, <4 x float>* %x, align 16
@@ -108,13 +105,12 @@ define double @dotdq(<4 x double>* nocapture readonly %x, <4 x double>* nocaptur
 ; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x double>, <4 x double>* [[X:%.*]], align 32
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x double>, <4 x double>* [[Y:%.*]], align 32
 ; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> [[TMP1]], [[TMP0]]
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x double> [[TMP2]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x double> [[BIN_RDX]], <4 x double> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x double> [[BIN_RDX]], [[RDX_SHUF1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    ret double [[TMP4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x double> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
 entry:
   %0 = load <4 x double>, <4 x double>* %x, align 32