1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-10-21 03:53:04 +02:00

Recommit f0c2a5a "[LV] Generalize conditions for sinking instrs for first order recurrences."

This version contains 2 fixes for reported issues:
1. Make sure we do not try to sink terminator instructions.
2. Make sure we bail out, if we try to sink an instruction that needs to
   stay in place for another recurrence.

Original message:
If the recurrence PHI node has a single user, we can sink any
instruction without side effects, given that all users are dominated by
the instruction computing the incoming value of the next iteration
('Previous'). We can sink instructions that may cause traps, because
that only causes the trap to occur later, but not on any new paths.

With the relaxed check, we also have to make sure that we do not have a
direct cycle (meaning PHI user == 'Previous), which indicates a
reduction relation, which potentially gets missed by
ReductionDescriptor.

As follow-ups, we can also sink stores, iff they do not alias with
other instructions we move them across and we could also support sinking
chains of instructions and multiple users of the PHI.

Fixes PR43398.

Reviewers: hsaito, dcaballe, Ayal, rengolin

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D69228
This commit is contained in:
Florian Hahn 2019-11-24 20:43:12 +00:00
parent bb27468c44
commit 1f7f3a6e63
4 changed files with 332 additions and 14 deletions

View File

@ -699,25 +699,41 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence(
// Ensure every user of the phi node is dominated by the previous value.
// The dominance requirement ensures the loop vectorizer will not need to
// vectorize the initial value prior to the first iteration of the loop.
// TODO: Consider extending this sinking to handle other kinds of instructions
// and expressions, beyond sinking a single cast past Previous.
// TODO: Consider extending this sinking to handle memory instructions and
// phis with multiple users.
// Returns true, if all users of I are dominated by DominatedBy.
auto allUsesDominatedBy = [DT](Instruction *I, Instruction *DominatedBy) {
return all_of(I->uses(), [DT, DominatedBy](Use &U) {
return DT->dominates(DominatedBy, U);
});
};
if (Phi->hasOneUse()) {
auto *I = Phi->user_back();
if (I->isCast() && (I->getParent() == Phi->getParent()) && I->hasOneUse() &&
DT->dominates(Previous, I->user_back())) {
if (!DT->dominates(Previous, I)) // Otherwise we're good w/o sinking.
SinkAfter[I] = Previous;
Instruction *I = Phi->user_back();
// If the user of the PHI is also the incoming value, we potentially have a
// reduction and which cannot be handled by sinking.
if (Previous == I)
return false;
// We cannot sink terminator instructions.
if (I->getParent()->getTerminator() == I)
return false;
if (DT->dominates(Previous, I)) // We already are good w/o sinking.
return true;
// We can sink any instruction without side effects, as long as all users
// are dominated by the instruction we are sinking after.
if (I->getParent() == Phi->getParent() && !I->mayHaveSideEffects() &&
allUsesDominatedBy(I, Previous)) {
SinkAfter[I] = Previous;
return true;
}
}
for (User *U : Phi->users())
if (auto *I = dyn_cast<Instruction>(U)) {
if (!DT->dominates(Previous, I))
return false;
}
return true;
return allUsesDominatedBy(Phi, Previous);
}
/// This function returns the identity element (or neutral element) for

View File

@ -815,6 +815,18 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
}
}
// For first order recurrences, we use the previous value (incoming value from
// the latch) to check if it dominates all users of the recurrence. Bail out
// if we have to sink such an instruction for another recurrence, as the
// dominance requirement may not hold after sinking.
BasicBlock *LoopLatch = TheLoop->getLoopLatch();
if (any_of(FirstOrderRecurrences, [LoopLatch, this](const PHINode *Phi) {
Instruction *V =
cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch));
return SinkAfter.find(V) != SinkAfter.end();
}))
return false;
// Now we know the widest induction type, check if our found induction
// is the same size. If it's not, unset it here and InnerLoopVectorizer
// will create another.

View File

@ -0,0 +1,245 @@
; RUN: opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
@p = external local_unnamed_addr global [257 x i32], align 16
@q = external local_unnamed_addr global [257 x i32], align 16
; Test case for PR43398.
define void @can_sink_after_store(i32 %x, i32* %ptr, i64 %tc) local_unnamed_addr #0 {
; CHECK-LABEL: vector.ph:
; CHECK: %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %x, i32 0
; CHECK-NEXT: %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
; CHECK-NEXT: br label %vector.body
; CHECK-LABEL: vector.body:
; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK-NEXT: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
; CHECK-NEXT: %offset.idx = add i64 1, %index
; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %offset.idx, i32 0
; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: %0 = add i64 %offset.idx, 0
; CHECK-NEXT: %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0
; CHECK-NEXT: %2 = getelementptr inbounds i32, i32* %1, i32 0
; CHECK-NEXT: %3 = bitcast i32* %2 to <4 x i32>*
; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* %3, align 4
; CHECK-NEXT: %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: %5 = add <4 x i32> %4, %broadcast.splat2
; CHECK-NEXT: %6 = add <4 x i32> %5, %wide.load
; CHECK-NEXT: %7 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0
; CHECK-NEXT: %8 = getelementptr inbounds i32, i32* %7, i32 0
; CHECK-NEXT: %9 = bitcast i32* %8 to <4 x i32>*
; CHECK-NEXT: store <4 x i32> %6, <4 x i32>* %9, align 4
; CHECK-NEXT: %index.next = add i64 %index, 4
; CHECK-NEXT: %10 = icmp eq i64 %index.next, 1996
; CHECK-NEXT: br i1 %10, label %middle.block, label %vector.body
;
entry:
br label %preheader
preheader:
%idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
%.pre = load i32, i32* %idx.phi.trans, align 4
br label %for
for:
%pre.phi = phi i32 [ %.pre, %preheader ], [ %pre.next, %for ]
%iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
%add.1 = add i32 %pre.phi, %x
%idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
%pre.next = load i32, i32* %idx.1, align 4
%add.2 = add i32 %add.1, %pre.next
%idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
store i32 %add.2, i32* %idx.2, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 2000
br i1 %exitcond, label %exit, label %for
exit:
ret void
}
; We can sink potential trapping instructions, as this will only delay the trap
; and not introduce traps on additional paths.
define void @sink_sdiv(i32 %x, i32* %ptr, i64 %tc) local_unnamed_addr #0 {
; CHECK-LABEL: vector.ph:
; CHECK: %broadcast.splatinsert1 = insertelement <4 x i32> undef, i32 %x, i32 0
; CHECK-NEXT: %broadcast.splat2 = shufflevector <4 x i32> %broadcast.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
; CHECK-NEXT: br label %vector.body
; CHECK-LABEL: vector.body:
; CHECK-NEXT: %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
; CHECK-NEXT: %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ %wide.load, %vector.body ]
; CHECK-NEXT: %offset.idx = add i64 1, %index
; CHECK-NEXT: %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %offset.idx, i32 0
; CHECK-NEXT: %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
; CHECK-NEXT: %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
; CHECK-NEXT: %0 = add i64 %offset.idx, 0
; CHECK-NEXT: %1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %0
; CHECK-NEXT: %2 = getelementptr inbounds i32, i32* %1, i32 0
; CHECK-NEXT: %3 = bitcast i32* %2 to <4 x i32>*
; CHECK-NEXT: %wide.load = load <4 x i32>, <4 x i32>* %3, align 4
; CHECK-NEXT: %4 = shufflevector <4 x i32> %vector.recur, <4 x i32> %wide.load, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
; CHECK-NEXT: %5 = sdiv <4 x i32> %4, %broadcast.splat2
; CHECK-NEXT: %6 = add <4 x i32> %5, %wide.load
; CHECK-NEXT: %7 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %0
; CHECK-NEXT: %8 = getelementptr inbounds i32, i32* %7, i32 0
; CHECK-NEXT: %9 = bitcast i32* %8 to <4 x i32>*
; CHECK-NEXT: store <4 x i32> %6, <4 x i32>* %9, align 4
; CHECK-NEXT: %index.next = add i64 %index, 4
; CHECK-NEXT: %10 = icmp eq i64 %index.next, 1996
; CHECK-NEXT: br i1 %10, label %middle.block, label %vector.body
;
entry:
br label %preheader
preheader:
%idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
%.pre = load i32, i32* %idx.phi.trans, align 4
br label %for
for:
%pre.phi = phi i32 [ %.pre, %preheader ], [ %pre.next, %for ]
%iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
%div.1 = sdiv i32 %pre.phi, %x
%idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
%pre.next = load i32, i32* %idx.1, align 4
%add.2 = add i32 %div.1, %pre.next
%idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
store i32 %add.2, i32* %idx.2, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 2000
br i1 %exitcond, label %exit, label %for
exit:
ret void
}
; FIXME: Currently we can only sink a single instruction. For the example below,
; we also have to sink users.
define void @cannot_sink_with_additional_user(i32 %x, i32* %ptr, i64 %tc) {
; CHECK-LABEL: define void @cannot_sink_with_additional_user(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label %preheader
; CHECK-LABEL: preheader: ; preds = %entry
; CHECK: br label %for
; CHECK-LABEL: for: ; preds = %for, %preheader
; CHECK br i1 %exitcond, label %exit, label %for
; CHECK-LABEL: exit:
; CHECK-NEXT: ret void
entry:
br label %preheader
preheader:
%idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
%.pre = load i32, i32* %idx.phi.trans, align 4
br label %for
for:
%pre.phi = phi i32 [ %.pre, %preheader ], [ %pre.next, %for ]
%iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
%add.1 = add i32 %pre.phi, %x
%add.2 = add i32 %add.1, %x
%idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
%pre.next = load i32, i32* %idx.1, align 4
%add.3 = add i32 %add.1, %pre.next
%add.4 = add i32 %add.2, %add.3
%idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
store i32 %add.4, i32* %idx.2, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 2000
br i1 %exitcond, label %exit, label %for
exit:
ret void
}
; FIXME: We can sink a store, if we can guarantee that it does not alias any
; loads/stores in between.
define void @cannot_sink_store(i32 %x, i32* %ptr, i64 %tc) {
; CHECK-LABEL: define void @cannot_sink_store(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label %preheader
; CHECK-LABEL: preheader: ; preds = %entry
; CHECK: br label %for
; CHECK-LABEL: for: ; preds = %for, %preheader
; CHECK br i1 %exitcond, label %exit, label %for
; CHECK-LABEL: exit:
; CHECK-NEXT: ret void
;
entry:
br label %preheader
preheader:
%idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
%.pre = load i32, i32* %idx.phi.trans, align 4
br label %for
for:
%pre.phi = phi i32 [ %.pre, %preheader ], [ %pre.next, %for ]
%iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
%add.1 = add i32 %pre.phi, %x
store i32 %add.1, i32* %ptr
%idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
%pre.next = load i32, i32* %idx.1, align 4
%add.2 = add i32 %add.1, %pre.next
%idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
store i32 %add.2, i32* %idx.2, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 2000
br i1 %exitcond, label %exit, label %for
exit:
ret void
}
; Some kinds of reductions are not detected by IVDescriptors. If we have a
; cycle, we cannot sink it.
define void @cannot_sink_reduction(i32 %x, i32* %ptr, i64 %tc) {
; CHECK-LABEL: define void @cannot_sink_reduction(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label %preheader
; CHECK-LABEL: preheader: ; preds = %entry
; CHECK: br label %for
; CHECK-LABEL: for: ; preds = %for, %preheader
; CHECK br i1 %exitcond, label %exit, label %for
; CHECK-LABEL: exit: ; preds = %for
; CHECK-NET: ret void
;
entry:
br label %preheader
preheader:
%idx.phi.trans = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 1
%.pre = load i32, i32* %idx.phi.trans, align 4
br label %for
for:
%pre.phi = phi i32 [ %.pre, %preheader ], [ %d, %for ]
%iv = phi i64 [ 1, %preheader ], [ %iv.next, %for ]
%d = sdiv i32 %pre.phi, %x
%idx.1 = getelementptr inbounds [257 x i32], [257 x i32]* @p, i64 0, i64 %iv
%pre.next = load i32, i32* %idx.1, align 4
%add.2 = add i32 %x, %pre.next
%idx.2 = getelementptr inbounds [257 x i32], [257 x i32]* @q, i64 0, i64 %iv
store i32 %add.2, i32* %idx.2, align 4
%iv.next = add nuw nsw i64 %iv, 1
%exitcond = icmp eq i64 %iv.next, 2000
br i1 %exitcond, label %exit, label %for
exit:
ret void
}

View File

@ -0,0 +1,45 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN:opt -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S %s | FileCheck %s
; For %for.1, we are fine initially, because the previous value %for.1.next dominates the
; user of %for.1. But for %for.2, we have to sink the user (%for.1.next) past the previous
; value %for.2.next. This however breaks the condition we have for %for.1. We cannot fix
; both first order recurrences and cannot vectorize the loop.
define i32 @c(i32 %N) {
; CHECK-LABEL: @c(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 10, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[FOR_1:%.*]] = phi i32 [ [[FOR_1_NEXT:%.*]], [[FOR_BODY]] ], [ 20, [[ENTRY]] ]
; CHECK-NEXT: [[FOR_2:%.*]] = phi i32 [ [[FOR_2_NEXT:%.*]], [[FOR_BODY]] ], [ 11, [[ENTRY]] ]
; CHECK-NEXT: [[FOR_1_NEXT]] = add nsw i32 [[FOR_2]], 1
; CHECK-NEXT: [[FOR_2_NEXT]] = shl i32 [[FOR_1]], 24
; CHECK-NEXT: [[INC]] = add nsw i32 [[IV]], 1
; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], [[N:%.*]]
; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND1_FOR_END_CRIT_EDGE:%.*]], label [[FOR_BODY]]
; CHECK: for.cond1.for.end_crit_edge:
; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[FOR_1_NEXT]], [[FOR_BODY]] ]
; CHECK-NEXT: [[SEXT_LCSSA:%.*]] = phi i32 [ [[FOR_2_NEXT]], [[FOR_BODY]] ]
; CHECK-NEXT: [[RES:%.*]] = add i32 [[ADD_LCSSA]], [[SEXT_LCSSA]]
; CHECK-NEXT: ret i32 [[RES]]
;
entry:
br label %for.body
for.body: ; preds = %for.body.preheader, %for.body
%iv = phi i32 [ %inc, %for.body ], [ 10, %entry ]
%for.1 = phi i32 [ %for.1.next, %for.body ], [ 20, %entry ]
%for.2 = phi i32 [ %for.2.next, %for.body ], [ 11, %entry ]
%for.1.next = add nsw i32 %for.2, 1
%for.2.next = shl i32 %for.1, 24
%inc = add nsw i32 %iv, 1
%exitcond = icmp eq i32 %inc, %N
br i1 %exitcond, label %for.cond1.for.end_crit_edge, label %for.body
for.cond1.for.end_crit_edge: ; preds = %for.body
%add.lcssa = phi i32 [ %for.1.next, %for.body ]
%sext.lcssa = phi i32 [ %for.2.next, %for.body ]
%res = add i32 %add.lcssa, %sext.lcssa
ret i32 %res
}