mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 12:43:36 +01:00
c208ece583
This is a simple pass that flattens nested loops. The intention is to optimise loop nests like this, which together access an array linearly: for (int i = 0; i < N; ++i) for (int j = 0; j < M; ++j) f(A[i*M+j]); into one loop: for (int i = 0; i < (N*M); ++i) f(A[i]); It can also flatten loops where the induction variables are not used in the loop. This can help with codesize and runtime, especially on simple CPUs without advanced branch prediction. This is only worth flattening if the induction variables are only used in an expression like i*M+j. If they had any other uses, we would have to insert a div/mod to reconstruct the original values, so this wouldn't be profitable. This partially fixes PR40581 as this pass triggers on one of the two cases. I will follow up on this to teach LoopFlatten a few more (small) tricks. Please note that LoopFlatten is not yet enabled by default. Patch by Oliver Stannard, with minor tweaks from Dave Green and myself. Differential Revision: https://reviews.llvm.org/D42365
109 lines
3.6 KiB
LLVM
109 lines
3.6 KiB
LLVM
; RUN: opt < %s -S -loop-flatten -verify-loop-info -verify-dom-info -verify-scev -verify | FileCheck %s

; Data layout for a little-endian target with 32-bit pointers (p:32:32) whose
; only native integer width is 32 bits (n32). All loop counters and array
; indices in the functions below are i32.
target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"

; Test case and IR corresponding to this code:
;
;   int k = 0;
;   for(int i = 0; i < n; i++)
;     for(int j = 0; j < n; j++) {
;       A[k] = B[k];
;       k++;
;     }
;
; TODO: this case doesn't trigger yet.
;
; In this IR the inner loop has no counter of its own that starts at zero:
; the running index k is carried from one outer iteration to the next, and the
; inner exit bound is recomputed as k + n in every outer-loop header. The
; negative FileCheck directive below pins the current behaviour, i.e. that no
; flattened trip count is emitted for this function.
;
define dso_local void @v0(i32 %n, i32* nocapture %A, i32* nocapture readonly %B) local_unnamed_addr #0 {
;
; CHECK-LABEL: @v0
; CHECK-NOT: %flatten.tripcount = mul i32 %n, %n
;
entry:
  ; Guard: the loop nest is entered only when n > 0 (signed compare).
  %cmp21 = icmp sgt i32 %n, 0
  br i1 %cmp21, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup

for.cond1.preheader.us.preheader:
  br label %for.cond1.preheader.us

; Outer-loop header.
for.cond1.preheader.us:
  ; i: outer counter, 0 on entry, %inc8.us on the back edge.
  %i.023.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  ; k at the start of this outer iteration: 0 on entry, otherwise the value
  ; the previous inner loop finished with.
  %k.022.us = phi i32 [ %inc.us.lcssa, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  ; Inner-loop exit bound for this outer iteration: k + n.
  %0 = add i32 %n, %k.022.us
  br label %for.body4.us

; Inner loop (single block): copies B[k] to A[k] and advances k.
for.body4.us:
  %k.119.us = phi i32 [ %k.022.us, %for.cond1.preheader.us ], [ %inc.us, %for.body4.us ]
  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %k.119.us
  %1 = load i32, i32* %arrayidx.us, align 4
  %arrayidx5.us = getelementptr inbounds i32, i32* %A, i32 %k.119.us
  store i32 %1, i32* %arrayidx5.us, align 4
  %inc.us = add i32 %k.119.us, 1
  ; Keep iterating until k reaches the bound computed in the outer header.
  %exitcond = icmp ne i32 %inc.us, %0
  br i1 %exitcond, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us

; Outer-loop latch.
for.cond1.for.cond.cleanup3_crit_edge.us:
  ; Single-input phi that carries the final k out of the inner loop.
  %inc.us.lcssa = phi i32 [ %inc.us, %for.body4.us ]
  %inc8.us = add nuw nsw i32 %i.023.us, 1
  %cmp.us = icmp slt i32 %inc8.us, %n
  br i1 %cmp.us, label %for.cond1.preheader.us, label %for.cond.cleanup.loopexit

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Test case and IR corresponding to this code:
;
;   for(int i = 0; i < n; i++)
;     for(int j = 0; j < n; j++) {
;       int k = i*n+j;
;       A[k] = B[k];
;       k++;
;     }
;
; Here both loops have their own counter starting at zero, and the only use of
; i and j inside the nest is the index expression i*n + j. The FileCheck
; directives below expect a flattened trip count n*n to be materialised in the
; outer preheader and the outer latch compare to be rewritten against it.
;
define dso_local void @v1(i32 %n, i32* nocapture %A, i32* nocapture readonly %B) local_unnamed_addr #0 {
;
; CHECK-LABEL: @v1
; CHECK: for.cond1.preheader.us.preheader:
; CHECK: %flatten.tripcount = mul i32 %n, %n
; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us:
; CHECK: %inc8.us = add nuw nsw i32 %i.024.us, 1
; CHECK: %cmp.us = icmp slt i32 %inc8.us, %flatten.tripcount
;
entry:
  ; Guard: the loop nest is entered only when n > 0 (signed compare).
  %cmp23 = icmp sgt i32 %n, 0
  br i1 %cmp23, label %for.cond1.preheader.us.preheader, label %for.cond.cleanup

for.cond1.preheader.us.preheader:
  br label %for.cond1.preheader.us

; Outer-loop header.
for.cond1.preheader.us:
  ; i: outer counter, 0 on entry, %inc8.us on the back edge.
  %i.024.us = phi i32 [ %inc8.us, %for.cond1.for.cond.cleanup3_crit_edge.us ], [ 0, %for.cond1.preheader.us.preheader ]
  ; Row offset i*n, computed once per outer iteration.
  %mul.us = mul nsw i32 %i.024.us, %n
  br label %for.body4.us

; Inner loop (single block): copies B[i*n + j] to A[i*n + j].
for.body4.us:
  ; j: inner counter, restarted at 0 for every outer iteration.
  %j.022.us = phi i32 [ 0, %for.cond1.preheader.us ], [ %inc6.us, %for.body4.us ]
  ; Linear index k = j + i*n.
  %add.us = add nsw i32 %j.022.us, %mul.us
  %arrayidx.us = getelementptr inbounds i32, i32* %B, i32 %add.us
  %0 = load i32, i32* %arrayidx.us, align 4
  %arrayidx5.us = getelementptr inbounds i32, i32* %A, i32 %add.us
  store i32 %0, i32* %arrayidx5.us, align 4
  %inc6.us = add nuw nsw i32 %j.022.us, 1
  ; Keep iterating until j reaches n.
  %exitcond = icmp ne i32 %inc6.us, %n
  br i1 %exitcond, label %for.body4.us, label %for.cond1.for.cond.cleanup3_crit_edge.us

; Outer-loop latch: in the input the bound is n.
for.cond1.for.cond.cleanup3_crit_edge.us:
  %inc8.us = add nuw nsw i32 %i.024.us, 1
  %cmp.us = icmp slt i32 %inc8.us, %n
  br i1 %cmp.us, label %for.cond1.preheader.us, label %for.cond.cleanup.loopexit

for.cond.cleanup.loopexit:
  br label %for.cond.cleanup

for.cond.cleanup:
  ret void
}