Detect vector reduction operations just before instruction selection.

(This is the second attempt to commit this patch, after fixing pr26652 & pr26653). This patch detects vector reductions before instruction selection. Vector reductions are vectorized reduction operations, and for such operations we have freedom to reorganize the elements of the result as long as the reduction of them stays unchanged. This will enable some reduction pattern recognition during instruction combine such as SAD/dot-product on X86. A flag is added to SDNodeFlags to mark those vector reduction nodes to be checked during instruction combine. To detect those vector reductions, we search def-use chains starting from the given instruction, and check if all uses fall into two categories: 1. Reduction with another vector. 2. Reduction on all elements. Category 2 is detected by recognizing the pattern that the loop vectorizer generates to reduce all elements in the vector outside of the loop, which includes several ShuffleVector and one ExtractElement instructions. Differential revision: http://reviews.llvm.org/D15250 llvm-svn: 261804
2024-11-23 11:13:28 +01:00 · 2016-02-24 23:40:36 +00:00 · 2016-02-24 23:40:36 +00:00 · 6af13323da
commit 6af13323da
parent 234a2e7455
4 changed files with 379 additions and 0 deletions
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@ -328,6 +328,7 @@ private:
  bool NoInfs : 1;
  bool NoSignedZeros : 1;
  bool AllowReciprocal : 1;
+  bool VectorReduction : 1;

 public:
  /// Default constructor turns off all optimization flags.
@ -340,6 +341,7 @@ public:
    NoInfs = false;
    NoSignedZeros = false;
    AllowReciprocal = false;
+    VectorReduction = false;
  }

  // These are mutators for each flag.
@ -351,6 +353,7 @@ public:
  void setNoInfs(bool b) { NoInfs = b; }
  void setNoSignedZeros(bool b) { NoSignedZeros = b; }
  void setAllowReciprocal(bool b) { AllowReciprocal = b; }
+  void setVectorReduction(bool b) { VectorReduction = b; }

  // These are accessors for each flag.
  bool hasNoUnsignedWrap() const { return NoUnsignedWrap; }
@ -361,6 +364,7 @@ public:
  bool hasNoInfs() const { return NoInfs; }
  bool hasNoSignedZeros() const { return NoSignedZeros; }
  bool hasAllowReciprocal() const { return AllowReciprocal; }
+  bool hasVectorReduction() const { return VectorReduction; }

  /// Return a raw encoding of the flags.
  /// This function should only be used to add data to the NodeID value.
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@ -2317,6 +2317,129 @@ void SelectionDAGBuilder::visitFSub(const User &I) {
  visitBinary(I, ISD::FSUB);
 }

+/// Checks if the given instruction performs a vector reduction, in which case
+/// we have the freedom to alter the elements in the result as long as the
+/// reduction of them stays unchanged.
+///
+/// Returns true only if every transitive user of \p I belongs to the reduction
+/// pattern described below and element 0 of the final result is extracted.
+static bool isVectorReductionOp(const User *I) {
+  const Instruction *Inst = dyn_cast<Instruction>(I);
+  if (!Inst || !Inst->getType()->isVectorTy())
+    return false;
+
+  auto OpCode = Inst->getOpcode();
+  switch (OpCode) {
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    break;
+  case Instruction::FAdd:
+  case Instruction::FMul:
+    if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
+      if (FPOp->getFastMathFlags().unsafeAlgebra())
+        break;
+    // Fall through.
+  default:
+    return false;
+  }
+
+  unsigned ElemNum = Inst->getType()->getVectorNumElements();
+  unsigned ElemNumToReduce = ElemNum;
+
+  // Do DFS search on the def-use chain from the given instruction. We only
+  // allow four kinds of operations during the search until we reach the
+  // instruction that extracts the first element from the vector:
+  //   1. The reduction operation of the same opcode as the given instruction.
+  //   2. PHI node.
+  //   3. ShuffleVector instruction together with a reduction operation that
+  //      does a partial reduction.
+  //   4. ExtractElement that extracts the first element from the vector, and we
+  //      stop searching the def-use chain here.
+  //
+  // 3 & 4 above perform a reduction on all elements of the vector. We push defs
+  // from 1-3 to the stack to continue the DFS. The given instruction is not
+  // a reduction operation if we meet any other instructions other than those
+  // listed above.
+
+  SmallVector<const User *, 16> UsersToVisit{Inst};
+  SmallPtrSet<const User *, 16> Visited;
+  bool ReduxExtracted = false;
+
+  while (!UsersToVisit.empty()) {
+    auto User = UsersToVisit.back();
+    UsersToVisit.pop_back();
+    if (!Visited.insert(User).second)
+      continue;
+
+    for (const auto &U : User->users()) {
+      auto Inst = dyn_cast<Instruction>(U);
+      if (!Inst)
+        return false;
+
+      if (Inst->getOpcode() == OpCode || isa<PHINode>(U)) {
+        if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
+          if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().unsafeAlgebra())
+            return false;
+        UsersToVisit.push_back(U);
+      } else if (const ShuffleVectorInst *ShufInst =
+                     dyn_cast<ShuffleVectorInst>(U)) {
+        // Detect the following pattern: A ShuffleVector instruction together
+        // with a reduction that does partial reduction on the first and second
+        // ElemNumToReduce / 2 elements, and store the result in
+        // ElemNumToReduce / 2 elements in another vector.
+
+        unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
+        // The mask has ResultElements entries, but the loops below index it up
+        // to ElemNum, so a narrowing shuffle cannot be part of the pattern.
+        if (ResultElements < ElemNum)
+          return false;
+        if (ElemNumToReduce == 1)
+          return false;
+        if (!isa<UndefValue>(U->getOperand(1)))
+          return false;
+        for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
+          if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
+            return false;
+        for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
+          if (ShufInst->getMaskValue(i) != -1)
+            return false;
+
+        // There is only one user of this ShuffleVector instruction, which
+        // must be a reduction operation.
+        if (!U->hasOneUse())
+          return false;
+
+        auto U2 = dyn_cast<Instruction>(*U->user_begin());
+        if (!U2 || U2->getOpcode() != OpCode)
+          return false;
+
+        // Check operands of the reduction operation.
+        if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
+            (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
+          UsersToVisit.push_back(U2);
+          ElemNumToReduce /= 2;
+        } else
+          return false;
+      } else if (isa<ExtractElementInst>(U)) {
+        // At this moment we should have reduced all elements in the vector.
+        if (ElemNumToReduce != 1)
+          return false;
+
+        const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
+        if (!Val || Val->getZExtValue() != 0)
+          return false;
+
+        ReduxExtracted = true;
+      } else
+        return false;
+    }
+  }
+  return ReduxExtracted;
+}
+
 void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
  SDValue Op1 = getValue(I.getOperand(0));
  SDValue Op2 = getValue(I.getOperand(1));
@ -2324,6 +2447,7 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
  bool nuw = false;
  bool nsw = false;
  bool exact = false;
+  bool vec_redux = false;
  FastMathFlags FMF;

  if (const OverflowingBinaryOperator *OFBinOp =
@ -2337,10 +2461,16 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&I))
    FMF = FPOp->getFastMathFlags();

+  if (isVectorReductionOp(&I)) {
+    vec_redux = true;
+    DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
+  }
+
  SDNodeFlags Flags;
  Flags.setExact(exact);
  Flags.setNoSignedWrap(nsw);
  Flags.setNoUnsignedWrap(nuw);
+  Flags.setVectorReduction(vec_redux);
  if (EnableFMFInDAG) {
    Flags.setAllowReciprocal(FMF.allowReciprocal());
    Flags.setNoInfs(FMF.noInfs());
--- a/test/CodeGen/Generic/pr26652.ll
+++ b/test/CodeGen/Generic/pr26652.ll
@ -0,0 +1,8 @@
+; RUN: llc < %s
+
+define <2 x i32> @test(<4 x i32> %a, <4 x i32> %b) {
+entry:
+  %0 = or <4 x i32> %a, %b
+  %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  ret <2 x i32> %1
+}
--- a/test/CodeGen/Generic/vector-redux.ll
+++ b/test/CodeGen/Generic/vector-redux.ll
@ -0,0 +1,237 @@
+; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+@a = global [1024 x i32] zeroinitializer, align 16
+
+define i32 @reduce_add() {
+; CHECK-LABEL: reduce_add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+; CHECK:       Detected a reduction operation: {{.*}} add
+
+min.iters.checked:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %min.iters.checked ], [ %index.next.4, %vector.body ]
+  %vec.phi = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %28, %vector.body ]
+  %vec.phi4 = phi <4 x i32> [ zeroinitializer, %min.iters.checked ], [ %29, %vector.body ]
+  %0 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 16
+  %2 = getelementptr i32, i32* %0, i64 4
+  %3 = bitcast i32* %2 to <4 x i32>*
+  %wide.load5 = load <4 x i32>, <4 x i32>* %3, align 16
+  %4 = add nsw <4 x i32> %wide.load, %vec.phi
+  %5 = add nsw <4 x i32> %wide.load5, %vec.phi4
+  %index.next = add nuw nsw i64 %index, 8
+  %6 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next
+  %7 = bitcast i32* %6 to <4 x i32>*
+  %wide.load.1 = load <4 x i32>, <4 x i32>* %7, align 16
+  %8 = getelementptr i32, i32* %6, i64 4
+  %9 = bitcast i32* %8 to <4 x i32>*
+  %wide.load5.1 = load <4 x i32>, <4 x i32>* %9, align 16
+  %10 = add nsw <4 x i32> %wide.load.1, %4
+  %11 = add nsw <4 x i32> %wide.load5.1, %5
+  %index.next.1 = add nsw i64 %index, 16
+  %12 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.1
+  %13 = bitcast i32* %12 to <4 x i32>*
+  %wide.load.2 = load <4 x i32>, <4 x i32>* %13, align 16
+  %14 = getelementptr i32, i32* %12, i64 4
+  %15 = bitcast i32* %14 to <4 x i32>*
+  %wide.load5.2 = load <4 x i32>, <4 x i32>* %15, align 16
+  %16 = add nsw <4 x i32> %wide.load.2, %10
+  %17 = add nsw <4 x i32> %wide.load5.2, %11
+  %index.next.2 = add nsw i64 %index, 24
+  %18 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.2
+  %19 = bitcast i32* %18 to <4 x i32>*
+  %wide.load.3 = load <4 x i32>, <4 x i32>* %19, align 16
+  %20 = getelementptr i32, i32* %18, i64 4
+  %21 = bitcast i32* %20 to <4 x i32>*
+  %wide.load5.3 = load <4 x i32>, <4 x i32>* %21, align 16
+  %22 = add nsw <4 x i32> %wide.load.3, %16
+  %23 = add nsw <4 x i32> %wide.load5.3, %17
+  %index.next.3 = add nsw i64 %index, 32
+  %24 = getelementptr inbounds [1024 x i32], [1024 x i32]* @a, i64 0, i64 %index.next.3
+  %25 = bitcast i32* %24 to <4 x i32>*
+  %wide.load.4 = load <4 x i32>, <4 x i32>* %25, align 16
+  %26 = getelementptr i32, i32* %24, i64 4
+  %27 = bitcast i32* %26 to <4 x i32>*
+  %wide.load5.4 = load <4 x i32>, <4 x i32>* %27, align 16
+  %28 = add nsw <4 x i32> %wide.load.4, %22
+  %29 = add nsw <4 x i32> %wide.load5.4, %23
+  %index.next.4 = add nsw i64 %index, 40
+  %30 = icmp eq i64 %index.next.4, 1000
+  br i1 %30, label %middle.block, label %vector.body
+
+middle.block:
+  %.lcssa10 = phi <4 x i32> [ %29, %vector.body ]
+  %.lcssa = phi <4 x i32> [ %28, %vector.body ]
+  %bin.rdx = add <4 x i32> %.lcssa10, %.lcssa
+  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx6 = add <4 x i32> %bin.rdx, %rdx.shuf
+  %rdx.shuf7 = shufflevector <4 x i32> %bin.rdx6, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx8 = add <4 x i32> %bin.rdx6, %rdx.shuf7
+  %31 = extractelement <4 x i32> %bin.rdx8, i32 0
+  ret i32 %31
+}
+
+define i32 @reduce_and() {
+; CHECK-LABEL: reduce_and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+; CHECK:       Detected a reduction operation: {{.*}} and
+
+entry:
+  br label %vector.body
+
+vector.body:
+  %lsr.iv = phi i64 [ %lsr.iv.next, %vector.body ], [ -4096, %entry ]
+  %vec.phi = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %6, %vector.body ]
+  %vec.phi9 = phi <4 x i32> [ <i32 -1, i32 -1, i32 -1, i32 -1>, %entry ], [ %7, %vector.body ]
+  %uglygep33 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep3334 = bitcast i8* %uglygep33 to <4 x i32>*
+  %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 256
+  %wide.load = load <4 x i32>, <4 x i32>* %scevgep35, align 16
+  %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %uglygep3334, i64 257
+  %wide.load10 = load <4 x i32>, <4 x i32>* %scevgep36, align 16
+  %0 = and <4 x i32> %wide.load, %vec.phi
+  %1 = and <4 x i32> %wide.load10, %vec.phi9
+  %uglygep30 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep3031 = bitcast i8* %uglygep30 to <4 x i32>*
+  %scevgep32 = getelementptr <4 x i32>, <4 x i32>* %uglygep3031, i64 258
+  %wide.load.1 = load <4 x i32>, <4 x i32>* %scevgep32, align 16
+  %uglygep27 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep2728 = bitcast i8* %uglygep27 to <4 x i32>*
+  %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %uglygep2728, i64 259
+  %wide.load10.1 = load <4 x i32>, <4 x i32>* %scevgep29, align 16
+  %2 = and <4 x i32> %wide.load.1, %0
+  %3 = and <4 x i32> %wide.load10.1, %1
+  %uglygep24 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep2425 = bitcast i8* %uglygep24 to <4 x i32>*
+  %scevgep26 = getelementptr <4 x i32>, <4 x i32>* %uglygep2425, i64 260
+  %wide.load.2 = load <4 x i32>, <4 x i32>* %scevgep26, align 16
+  %uglygep21 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep2122 = bitcast i8* %uglygep21 to <4 x i32>*
+  %scevgep23 = getelementptr <4 x i32>, <4 x i32>* %uglygep2122, i64 261
+  %wide.load10.2 = load <4 x i32>, <4 x i32>* %scevgep23, align 16
+  %4 = and <4 x i32> %wide.load.2, %2
+  %5 = and <4 x i32> %wide.load10.2, %3
+  %uglygep18 = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep1819 = bitcast i8* %uglygep18 to <4 x i32>*
+  %scevgep20 = getelementptr <4 x i32>, <4 x i32>* %uglygep1819, i64 262
+  %wide.load.3 = load <4 x i32>, <4 x i32>* %scevgep20, align 16
+  %uglygep = getelementptr i8, i8* bitcast ([1024 x i32]* @a to i8*), i64 %lsr.iv
+  %uglygep17 = bitcast i8* %uglygep to <4 x i32>*
+  %scevgep = getelementptr <4 x i32>, <4 x i32>* %uglygep17, i64 263
+  %wide.load10.3 = load <4 x i32>, <4 x i32>* %scevgep, align 16
+  %6 = and <4 x i32> %wide.load.3, %4
+  %7 = and <4 x i32> %wide.load10.3, %5
+  %lsr.iv.next = add nsw i64 %lsr.iv, 128
+  %8 = icmp eq i64 %lsr.iv.next, 0
+  br i1 %8, label %middle.block, label %vector.body
+
+middle.block:
+  %bin.rdx = and <4 x i32> %7, %6
+  %rdx.shuf = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx11 = and <4 x i32> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <4 x i32> %bin.rdx11, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = and <4 x i32> %bin.rdx11, %rdx.shuf12
+  %9 = extractelement <4 x i32> %bin.rdx13, i32 0
+  ret i32 %9
+}
+
+define float @reduce_add_float(float* nocapture readonly %a) {
+; CHECK-LABEL: reduce_add_float
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next.4, %vector.body ]
+  %vec.phi = phi <4 x float> [ zeroinitializer, %entry ], [ %28, %vector.body ]
+  %vec.phi9 = phi <4 x float> [ zeroinitializer, %entry ], [ %29, %vector.body ]
+  %0 = getelementptr inbounds float, float* %a, i64 %index
+  %1 = bitcast float* %0 to <4 x float>*
+  %wide.load = load <4 x float>, <4 x float>* %1, align 4
+  %2 = getelementptr float, float* %0, i64 4
+  %3 = bitcast float* %2 to <4 x float>*
+  %wide.load10 = load <4 x float>, <4 x float>* %3, align 4
+  %4 = fadd fast <4 x float> %wide.load, %vec.phi
+  %5 = fadd fast <4 x float> %wide.load10, %vec.phi9
+  %index.next = add nuw nsw i64 %index, 8
+  %6 = getelementptr inbounds float, float* %a, i64 %index.next
+  %7 = bitcast float* %6 to <4 x float>*
+  %wide.load.1 = load <4 x float>, <4 x float>* %7, align 4
+  %8 = getelementptr float, float* %6, i64 4
+  %9 = bitcast float* %8 to <4 x float>*
+  %wide.load10.1 = load <4 x float>, <4 x float>* %9, align 4
+  %10 = fadd fast <4 x float> %wide.load.1, %4
+  %11 = fadd fast <4 x float> %wide.load10.1, %5
+  %index.next.1 = add nsw i64 %index, 16
+  %12 = getelementptr inbounds float, float* %a, i64 %index.next.1
+  %13 = bitcast float* %12 to <4 x float>*
+  %wide.load.2 = load <4 x float>, <4 x float>* %13, align 4
+  %14 = getelementptr float, float* %12, i64 4
+  %15 = bitcast float* %14 to <4 x float>*
+  %wide.load10.2 = load <4 x float>, <4 x float>* %15, align 4
+  %16 = fadd fast <4 x float> %wide.load.2, %10
+  %17 = fadd fast <4 x float> %wide.load10.2, %11
+  %index.next.2 = add nsw i64 %index, 24
+  %18 = getelementptr inbounds float, float* %a, i64 %index.next.2
+  %19 = bitcast float* %18 to <4 x float>*
+  %wide.load.3 = load <4 x float>, <4 x float>* %19, align 4
+  %20 = getelementptr float, float* %18, i64 4
+  %21 = bitcast float* %20 to <4 x float>*
+  %wide.load10.3 = load <4 x float>, <4 x float>* %21, align 4
+  %22 = fadd fast <4 x float> %wide.load.3, %16
+  %23 = fadd fast <4 x float> %wide.load10.3, %17
+  %index.next.3 = add nsw i64 %index, 32
+  %24 = getelementptr inbounds float, float* %a, i64 %index.next.3
+  %25 = bitcast float* %24 to <4 x float>*
+  %wide.load.4 = load <4 x float>, <4 x float>* %25, align 4
+  %26 = getelementptr float, float* %24, i64 4
+  %27 = bitcast float* %26 to <4 x float>*
+  %wide.load10.4 = load <4 x float>, <4 x float>* %27, align 4
+  %28 = fadd fast <4 x float> %wide.load.4, %22
+  %29 = fadd fast <4 x float> %wide.load10.4, %23
+  %index.next.4 = add nsw i64 %index, 40
+  %30 = icmp eq i64 %index.next.4, 1000
+  br i1 %30, label %middle.block, label %vector.body
+
+middle.block:
+  %.lcssa15 = phi <4 x float> [ %29, %vector.body ]
+  %.lcssa = phi <4 x float> [ %28, %vector.body ]
+  %bin.rdx = fadd fast <4 x float> %.lcssa15, %.lcssa
+  %rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx11 = fadd fast <4 x float> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <4 x float> %bin.rdx11, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = fadd fast <4 x float> %bin.rdx11, %rdx.shuf12
+  %31 = extractelement <4 x float> %bin.rdx13, i32 0
+  ret float %31
+}