mirror of https://github.com/RPCS3/llvm-mirror.git
9530f416eb
The Technical Reference Manuals for these CPUs (Cortex-M3 and Cortex-M4) state that branching to an unaligned 32-bit instruction incurs an extra pipeline reload penalty. That's bad. This also enables the optimization at -Os, since it costs on average one byte per loop in return for 1 cycle per iteration, which is pretty good going. llvm-svn: 342127
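For illustration, a hand-written sketch of the kind of code this produces at -Os. This is not actual llc output; the instruction selection, register choices, and label names are assumptions. The point is the .p2align 2 before the loop header: the assembler pads the loop head to a 4-byte boundary, so the backward branch never lands on an unaligned 32-bit Thumb-2 instruction.

test_loop_alignment:
        movs    r2, #0              @ i = 0 (16-bit Thumb encoding)
        .p2align 2                  @ pad so the loop header is word-aligned
.LBB0_1:                            @ %loop (aligned branch target)
        ldr     r3, [r0], #4        @ load *in++ (32-bit Thumb-2 encoding)
        adds    r2, r2, #1          @ i++
        add.w   r3, r3, r3, lsl #2  @ lhs * 5, as r3 + (r3 << 2)
        str     r3, [r1], #4        @ store to *out++
        cmp     r2, #1024
        bne     .LBB0_1             @ backward branch hits an aligned address
        bx      lr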
; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m3 -o - | FileCheck %s
; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m4 -o - | FileCheck %s
; RUN: llc -mtriple=thumbv7m-none-eabi %s -mcpu=cortex-m33 -o - | FileCheck %s

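; With optsize (-Os), the loop header should be aligned: expect a
; .p2align 2 between the counter setup and the loop body.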
define void @test_loop_alignment(i32* %in, i32* %out) optsize {
; CHECK-LABEL: test_loop_alignment:
; CHECK: movs {{r[0-9]+}}, #0
; CHECK: .p2align 2

entry:
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %in.addr = getelementptr inbounds i32, i32* %in, i32 %i
  %lhs = load i32, i32* %in.addr, align 4
  %res = mul nsw i32 %lhs, 5
  %out.addr = getelementptr inbounds i32, i32* %out, i32 %i
  store i32 %res, i32* %out.addr, align 4
  %i.next = add i32 %i, 1
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %end, label %loop

end:
  ret void
}

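; With minsize (-Oz), the padding bytes are not worth the saved cycles:
; expect no .p2align before the loop header.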
define void @test_loop_alignment_minsize(i32* %in, i32* %out) minsize {
; CHECK-LABEL: test_loop_alignment_minsize:
; CHECK: movs {{r[0-9]+}}, #0
; CHECK-NOT: .p2align

entry:
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %in.addr = getelementptr inbounds i32, i32* %in, i32 %i
  %lhs = load i32, i32* %in.addr, align 4
  %res = mul nsw i32 %lhs, 5
  %out.addr = getelementptr inbounds i32, i32* %out, i32 %i
  store i32 %res, i32* %out.addr, align 4
  %i.next = add i32 %i, 1
  %done = icmp eq i32 %i.next, 1024
  br i1 %done, label %end, label %loop

end:
  ret void
}