diff --git a/lib/Transforms/Scalar/LoopInterchange.cpp b/lib/Transforms/Scalar/LoopInterchange.cpp
index cf783e947a1..973194482ab 100644
--- a/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -625,6 +625,13 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
       containsUnsafeInstructions(OuterLoopLatch))
     return false;
 
+  // Also make sure the inner loop preheader does not contain any unsafe
+  // instructions. Note that all instructions in the preheader will be moved to
+  // the outer loop header when interchanging.
+  if (InnerLoopPreHeader != OuterLoopHeader &&
+      containsUnsafeInstructions(InnerLoopPreHeader))
+    return false;
+
   LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
   // We have a perfect loop nest.
   return true;
@@ -1306,6 +1313,21 @@ bool LoopInterchangeTransform::transform() {
     LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
   }
 
+  // Instructions in the original inner loop preheader may depend on values
+  // defined in the outer loop header. Move them there, because the original
+  // inner loop preheader will become the entry into the interchanged loop nest.
+  // Currently we move all instructions and rely on LICM to move invariant
+  // instructions outside the loop nest.
+  BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+  BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+  if (InnerLoopPreHeader != OuterLoopHeader) {
+    // The preheader's terminator (its last instruction) is left in place.
+    for (Instruction &I :
+         make_early_inc_range(make_range(InnerLoopPreHeader->begin(),
+                                         std::prev(InnerLoopPreHeader->end()))))
+      I.moveBefore(OuterLoopHeader->getTerminator());
+  }
+
   Transformed |= adjustLoopLinks();
   if (!Transformed) {
     LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
diff --git a/test/Transforms/LoopInterchange/lcssa-preheader.ll b/test/Transforms/LoopInterchange/lcssa-preheader.ll
index 57cabfc9bed..3205a546581 100644
--- a/test/Transforms/LoopInterchange/lcssa-preheader.ll
+++ b/test/Transforms/LoopInterchange/lcssa-preheader.ll
@@ -20,11 +20,11 @@ define void @lcssa_08(i32 %n, i32 %m) {
 ; CHECK-NEXT:    [[CMP24:%.*]] = icmp sgt i32 [[N:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP24]], label [[INNER_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
 ; CHECK:       outer.preheader:
-; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[M:%.*]] to i64
 ; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
 ; CHECK:       outer.header:
 ; CHECK-NEXT:    [[INDVARS_IV27:%.*]] = phi i64 [ 0, [[OUTER_PREHEADER:%.*]] ], [ [[INDVARS_IV_NEXT28:%.*]], [[OUTER_LATCH:%.*]] ]
-; CHECK-NEXT:    [[CMP222:%.*]] = icmp sgt i32 [[M]], 0
+; CHECK-NEXT:    [[CMP222:%.*]] = icmp sgt i32 [[M:%.*]], 0
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[M]] to i64
 ; CHECK-NEXT:    br i1 [[CMP222]], label [[INNER_FOR_BODY_SPLIT1:%.*]], label [[OUTER_CRIT_EDGE:%.*]]
 ; CHECK:       inner.preheader:
 ; CHECK-NEXT:    [[WIDE_TRIP_COUNT29:%.*]] = zext i32 [[N]] to i64
@@ -41,8 +41,9 @@ define void @lcssa_08(i32 %n, i32 %m) {
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    br label [[INNER_CRIT_EDGE:%.*]]
 ; CHECK:       inner.for.body.split:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT_LCSSA2:%.*]] = phi i64 [ [[WIDE_TRIP_COUNT]], [[OUTER_LATCH]] ]
 ; CHECK-NEXT:    [[TMP1]] = add nuw nsw i64 [[INDVARS_IV]], 1
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ne i64 [[TMP1]], [[WIDE_TRIP_COUNT_LCSSA2]]
 ; CHECK-NEXT:    br i1 [[TMP2]], label [[INNER_FOR_BODY]], label [[OUTER_CRIT_EDGE]]
 ; CHECK:       inner.crit_edge:
 ; CHECK-NEXT:    br label [[OUTER_LATCH]]
diff --git a/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll b/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll
new file mode 100644
index 00000000000..f50fbb0da8e
--- /dev/null
+++ b/test/Transforms/LoopInterchange/pr45743-move-from-inner-preheader.ll
@@ -0,0 +1,141 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -loop-interchange -S %s | FileCheck %s
+
+@global = external local_unnamed_addr global [2 x [10 x i32]], align 16
+
+; We need to move %tmp4 from the inner loop pre header to the outer loop header
+; before interchanging.
+define void @test1() local_unnamed_addr #0 {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[INNER_PH:%.*]]
+; CHECK:       outer.header.preheader:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[OUTER_HEADER_PREHEADER:%.*]] ]
+; CHECK-NEXT:    [[INNER_RED:%.*]] = phi i32 [ [[OUTER_RED:%.*]], [[OUTER_HEADER_PREHEADER]] ], [ [[RED_NEXT:%.*]], [[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i64 [[OUTER_IV]], 9
+; CHECK-NEXT:    br label [[INNER_SPLIT1:%.*]]
+; CHECK:       inner.ph:
+; CHECK-NEXT:    br label [[INNER:%.*]]
+; CHECK:       inner:
+; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[INNER_PH]] ], [ [[TMP0:%.*]], [[INNER_SPLIT:%.*]] ]
+; CHECK-NEXT:    [[OUTER_RED]] = phi i32 [ [[RED_NEXT_LCSSA:%.*]], [[INNER_SPLIT]] ], [ 0, [[INNER_PH]] ]
+; CHECK-NEXT:    br label [[OUTER_HEADER_PREHEADER]]
+; CHECK:       inner.split1:
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds [2 x [10 x i32]], [2 x [10 x i32]]* @global, i64 0, i64 [[INNER_IV]], i64 [[TMP4]]
+; CHECK-NEXT:    store i32 0, i32* [[PTR]], align 4
+; CHECK-NEXT:    [[RED_NEXT]] = or i32 [[INNER_RED]], 20
+; CHECK-NEXT:    [[INNER_IV_NEXT:%.*]] = add nsw i64 [[INNER_IV]], 1
+; CHECK-NEXT:    [[EC_1:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], 400
+; CHECK-NEXT:    br label [[OUTER_LATCH]]
+; CHECK:       inner.split:
+; CHECK-NEXT:    [[RED_NEXT_LCSSA]] = phi i32 [ [[RED_NEXT]], [[OUTER_LATCH]] ]
+; CHECK-NEXT:    [[TMP0]] = add nsw i64 [[INNER_IV]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i64 [[TMP0]], 400
+; CHECK-NEXT:    br i1 [[TMP1]], label [[EXIT:%.*]], label [[INNER]]
+; CHECK:       outer.latch:
+; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nsw i64 [[OUTER_IV]], 1
+; CHECK-NEXT:    [[EC_2:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], 400
+; CHECK-NEXT:    br i1 [[EC_2]], label [[INNER_SPLIT]], label [[OUTER_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+bb:
+  br label %outer.header
+
+outer.header:                                     ; preds = %outer.latch, %bb
+  %outer.iv = phi i64 [ 0, %bb ], [ %outer.iv.next, %outer.latch ]
+  %outer.red = phi i32 [ 0, %bb ], [ %red.next.lcssa, %outer.latch ]
+  br label %inner.ph
+
+inner.ph:                                         ; preds = %outer.header
+  %tmp4 = add nsw i64 %outer.iv, 9
+  br label %inner
+
+inner:                                            ; preds = %inner, %inner.ph
+  %inner.iv = phi i64 [ 0, %inner.ph ], [ %inner.iv.next, %inner ]
+  %inner.red = phi i32 [ %outer.red, %inner.ph ], [ %red.next, %inner ]
+  %ptr = getelementptr inbounds [2 x [10 x i32]], [2 x [10 x i32]]* @global, i64 0, i64 %inner.iv, i64 %tmp4
+  store i32 0, i32* %ptr
+  %red.next = or i32 %inner.red, 20
+  %inner.iv.next = add nsw i64 %inner.iv, 1
+  %ec.1 = icmp eq i64 %inner.iv.next, 400
+  br i1 %ec.1, label %outer.latch, label %inner
+
+outer.latch:                                      ; preds = %inner
+  %red.next.lcssa = phi i32 [ %red.next, %inner ]
+  %outer.iv.next = add nsw i64 %outer.iv, 1
+  %ec.2 = icmp eq i64 %outer.iv.next, 400
+  br i1 %ec.2, label %exit, label %outer.header
+
+exit:                                             ; preds = %outer.latch
+  ret void
+}
+
+declare void @side_effect()
+
+; Cannot interchange, as the inner loop preheader contains a call to a function
+; with side effects.
+
+define void @test2() {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[BB:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ]
+; CHECK-NEXT:    [[OUTER_RED:%.*]] = phi i32 [ 0, [[BB]] ], [ [[RED_NEXT_LCSSA:%.*]], [[OUTER_LATCH]] ]
+; CHECK-NEXT:    br label [[INNER_PH:%.*]]
+; CHECK:       inner.ph:
+; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i64 [[OUTER_IV]], 9
+; CHECK-NEXT:    call void @side_effect()
+; CHECK-NEXT:    br label [[INNER:%.*]]
+; CHECK:       inner:
+; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ 0, [[INNER_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER]] ]
+; CHECK-NEXT:    [[INNER_RED:%.*]] = phi i32 [ [[OUTER_RED]], [[INNER_PH]] ], [ [[RED_NEXT:%.*]], [[INNER]] ]
+; CHECK-NEXT:    [[PTR:%.*]] = getelementptr inbounds [2 x [10 x i32]], [2 x [10 x i32]]* @global, i64 0, i64 [[INNER_IV]], i64 [[TMP4]]
+; CHECK-NEXT:    store i32 0, i32* [[PTR]], align 4
+; CHECK-NEXT:    [[RED_NEXT]] = or i32 [[INNER_RED]], 20
+; CHECK-NEXT:    [[INNER_IV_NEXT]] = add nsw i64 [[INNER_IV]], 1
+; CHECK-NEXT:    [[EC_1:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], 400
+; CHECK-NEXT:    br i1 [[EC_1]], label [[OUTER_LATCH]], label [[INNER]]
+; CHECK:       outer.latch:
+; CHECK-NEXT:    [[RED_NEXT_LCSSA]] = phi i32 [ [[RED_NEXT]], [[INNER]] ]
+; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nsw i64 [[OUTER_IV]], 1
+; CHECK-NEXT:    [[EC_2:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], 400
+; CHECK-NEXT:    br i1 [[EC_2]], label [[EXIT:%.*]], label [[OUTER_HEADER]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+bb:
+  br label %outer.header
+
+outer.header:                                     ; preds = %outer.latch, %bb
+  %outer.iv = phi i64 [ 0, %bb ], [ %outer.iv.next, %outer.latch ]
+  %outer.red = phi i32 [ 0, %bb ], [ %red.next.lcssa, %outer.latch ]
+  br label %inner.ph
+
+inner.ph:                                         ; preds = %outer.header
+  %tmp4 = add nsw i64 %outer.iv, 9
+  call void @side_effect()
+  br label %inner
+
+inner:                                            ; preds = %inner, %inner.ph
+  %inner.iv = phi i64 [ 0, %inner.ph ], [ %inner.iv.next, %inner ]
+  %inner.red = phi i32 [ %outer.red, %inner.ph ], [ %red.next, %inner ]
+  %ptr = getelementptr inbounds [2 x [10 x i32]], [2 x [10 x i32]]* @global, i64 0, i64 %inner.iv, i64 %tmp4
+  store i32 0, i32* %ptr
+  %red.next = or i32 %inner.red, 20
+  %inner.iv.next = add nsw i64 %inner.iv, 1
+  %ec.1 = icmp eq i64 %inner.iv.next, 400
+  br i1 %ec.1, label %outer.latch, label %inner
+
+outer.latch:                                      ; preds = %inner
+  %red.next.lcssa = phi i32 [ %red.next, %inner ]
+  %outer.iv.next = add nsw i64 %outer.iv, 1
+  %ec.2 = icmp eq i64 %outer.iv.next, 400
+  br i1 %ec.2, label %exit, label %outer.header
+
+exit:                                             ; preds = %outer.latch
+  ret void
+}