mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 19:23:23 +01:00
fbea2e7ae6
It may be detrimental to vectorize loops with very small trip count, as various costs of the vectorized loop body as well as enclosing overheads including runtime tests and scalar iterations may outweigh the gains of vectorizing. The current cost model measures the cost of the vectorized loop body only, expecting it will amortize other costs, and loops with known or expected very small trip counts are not vectorized at all. This patch allows loops with very small trip counts to be vectorized, but under OptForSize constraints, which ensure the cost of the loop body is dominant, having no runtime guards nor scalar iterations. Patch inspired by D32451. Differential Revision: https://reviews.llvm.org/D34373 llvm-svn: 306803
58 lines
2.4 KiB
LLVM
58 lines
2.4 KiB
LLVM
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s

; Loop-vectorizer test for loops with a very small trip count. The command
; line above forces a vector width of 4 with an interleave count of 1, so a
; vectorized loop in this file shows up as <4 x i32> memory operations.

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

; Three 2048-element i32 arrays shared by the loops in this file: each loop
; loads from @b and @c and stores the element-wise sum into @a.
@a = common global [2048 x i32] zeroinitializer, align 16
@b = common global [2048 x i32] zeroinitializer, align 16
@c = common global [2048 x i32] zeroinitializer, align 16

; @example1: a[i] = b[i] + c[i] for i in [0, 8).
; The exit test compares the incremented induction variable against the
; constant 8, so the trip count is a compile-time constant. The directives
; below require a <4 x i32> load in the output for this function, i.e. the
; loop must be vectorized even though its trip count is tiny.
;CHECK-LABEL: @example1(
;CHECK: load <4 x i32>
;CHECK: ret void
define void @example1() nounwind uwtable ssp {
  br label %1

; <label>:1                                       ; preds = %1, %0
  ; Induction variable: 0 on entry, incremented value on the back edge.
  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
  ; %3 = b[i]
  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  ; %5 = c[i]
  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  ; a[i] = c[i] + b[i]
  %6 = add nsw i32 %5, %3
  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
  store i32 %6, i32* %7, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  ; A really small trip count (8). It is a multiple of the forced vector
  ; width of 4, so vectorizing it leaves no scalar iteration overhead.
  %exitcond = icmp eq i32 %lftr.wideiv, 8
  br i1 %exitcond, label %8, label %1

; <label>:8                                       ; preds = %1
  ret void
}

; @bound1: a[i] = b[i] + c[i], leaving the loop as soon as i + 1 == %k or
; i + 1 >= 8 (signed compare). The trip count therefore has an upper bound
; of 8 but its exact value depends on the argument %k. The directives below
; require that no <4 x i32> load appears before the function's "ret void",
; i.e. this loop must stay scalar.
; NOTE(review): presumably the unknown exact trip count would require a
; scalar remainder loop, which the small-trip-count policy does not allow;
; confirm against the vectorizer's cost model.
;CHECK-LABEL: @bound1(
;CHECK-NOT: load <4 x i32>
;CHECK: ret void
define void @bound1(i32 %k) nounwind uwtable ssp {
  br label %1

; <label>:1                                       ; preds = %1, %0
  ; Induction variable: 0 on entry, incremented value on the back edge.
  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
  ; %3 = b[i]
  %2 = getelementptr inbounds [2048 x i32], [2048 x i32]* @b, i64 0, i64 %indvars.iv
  %3 = load i32, i32* %2, align 4
  ; %5 = c[i]
  %4 = getelementptr inbounds [2048 x i32], [2048 x i32]* @c, i64 0, i64 %indvars.iv
  %5 = load i32, i32* %4, align 4
  ; a[i] = c[i] + b[i]
  %6 = add nsw i32 %5, %3
  %7 = getelementptr inbounds [2048 x i32], [2048 x i32]* @a, i64 0, i64 %indvars.iv
  store i32 %6, i32* %7, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  ; Upper bound on the trip count: stop once the incremented index reaches 8.
  %large = icmp sge i32 %lftr.wideiv, 8
  ; Data-dependent exit: stop when the incremented index equals %k.
  %exitcond = icmp eq i32 %lftr.wideiv, %k
  ; Leave the loop when either condition holds.
  %realexit = or i1 %large, %exitcond
  br i1 %realexit, label %8, label %1

; <label>:8                                       ; preds = %1
  ret void
}