llvm-mirror/test/CodeGen/Thumb2/mve-phireg.ll
[CodeGen][regalloc] Don't align stack slots if the stack can't be realigned
(Tomas Matheson, commit bb84fe4048)
Register allocation may spill virtual registers to the stack, which can
increase the alignment requirements of the stack frame. If the function
did not require stack realignment before register allocation, the
registers required to do so may not be reserved/available. This results
in a stack frame that requires realignment but cannot be realigned.

Instead, only increase the alignment of the stack if we are still able
to realign.

The register class's SpillAlignment will be ignored if we can't realign,
and the backend will be responsible for emitting the correct unaligned
loads and stores. This seems to be the assumed behaviour already, e.g.
ARMBaseInstrInfo::storeRegToStackSlot and X86InstrInfo::storeRegToStackSlot
are both `canRealignStack` aware.
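
A minimal sketch of that guard, assuming the stock MachineFrameInfo /
TargetRegisterInfo APIs; this illustrates the idea, not the patch text,
and the helper createSpillSlot below is hypothetical:

    // Sketch only: clamp the requested spill alignment when the target
    // reports that the stack cannot be realigned. Not the actual patch.
    #include "llvm/CodeGen/MachineFrameInfo.h"
    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/TargetFrameLowering.h"
    #include "llvm/CodeGen/TargetRegisterInfo.h"
    #include "llvm/CodeGen/TargetSubtargetInfo.h"
    #include <algorithm>

    using namespace llvm;

    // Hypothetical helper: create a spill slot for RC without raising the
    // frame alignment past what the stack can actually provide.
    static int createSpillSlot(MachineFunction &MF,
                               const TargetRegisterClass &RC,
                               const TargetRegisterInfo &TRI) {
      MachineFrameInfo &MFI = MF.getFrameInfo();
      Align SpillAlign = TRI.getSpillAlign(RC);
      // If the prologue can no longer realign the stack (e.g. the scratch
      // registers realignment needs were never reserved), an over-aligned
      // slot could not be honoured; clamp to the target's guaranteed
      // stack alignment instead.
      const TargetFrameLowering &TFL = *MF.getSubtarget().getFrameLowering();
      if (!TRI.canRealignStack(MF))
        SpillAlign = std::min(SpillAlign, TFL.getStackAlign());
      return MFI.CreateSpillStackObject(TRI.getSpillSize(RC), SpillAlign);
    }

With such a clamp in place, a target whose storeRegToStackSlot checks
canRealignStack (as the ARM and X86 implementations cited above do) sees
the lower slot alignment and selects unaligned load/store variants.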

Differential Revision: https://reviews.llvm.org/D103602
2021-06-11 16:49:12 +01:00

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp --arm-memtransfer-tploop=allow -enable-arm-maskedgatscat=false -verify-machineinstrs %s -o - | FileCheck %s
; verify-machineinstrs previously caught the incorrect use of QPR in the stack reloads.

define arm_aapcs_vfpcc void @k() {
; CHECK-LABEL: k:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: .pad #4
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14}
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: adr.w r8, .LCPI0_0
; CHECK-NEXT: adr.w r9, .LCPI0_1
; CHECK-NEXT: vldrw.u32 q6, [r8]
; CHECK-NEXT: vldrw.u32 q5, [r9]
; CHECK-NEXT: vmov.i32 q0, #0x1
; CHECK-NEXT: vmov.i8 q1, #0x0
; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vmov.i16 q3, #0x6
; CHECK-NEXT: vmov.i16 q4, #0x3
; CHECK-NEXT: mov.w r12, #0
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vand q6, q6, q0
; CHECK-NEXT: vand q5, q5, q0
; CHECK-NEXT: vcmp.i32 eq, q6, zr
; CHECK-NEXT: cmp.w r12, #0
; CHECK-NEXT: vpsel q6, q2, q1
; CHECK-NEXT: vcmp.i32 eq, q5, zr
; CHECK-NEXT: vpsel q5, q2, q1
; CHECK-NEXT: vmov r4, r0, d12
; CHECK-NEXT: vmov r3, r6, d10
; CHECK-NEXT: vmov r1, r2, d11
; CHECK-NEXT: vmov.16 q5[0], r3
; CHECK-NEXT: vmov.16 q5[1], r6
; CHECK-NEXT: vmov r5, r7, d13
; CHECK-NEXT: vmov.16 q5[2], r1
; CHECK-NEXT: vmov.16 q5[3], r2
; CHECK-NEXT: vmov.16 q5[4], r4
; CHECK-NEXT: vmov.16 q5[5], r0
; CHECK-NEXT: vmov.16 q5[6], r5
; CHECK-NEXT: vmov.16 q5[7], r7
; CHECK-NEXT: vcmp.i16 ne, q5, zr
; CHECK-NEXT: vmov.i32 q5, #0x0
; CHECK-NEXT: vpsel q6, q4, q3
; CHECK-NEXT: vstrh.16 q6, [r0]
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: bne .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %for.cond4.preheader
; CHECK-NEXT: movs r6, #0
; CHECK-NEXT: cbnz r6, .LBB0_5
; CHECK-NEXT: .LBB0_3: @ %for.body10
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: cbnz r6, .LBB0_4
; CHECK-NEXT: le .LBB0_3
; CHECK-NEXT: .LBB0_4: @ %for.cond4.loopexit
; CHECK-NEXT: bl l
; CHECK-NEXT: .LBB0_5: @ %vector.body105.preheader
; CHECK-NEXT: vldrw.u32 q0, [r8]
; CHECK-NEXT: vldrw.u32 q1, [r9]
; CHECK-NEXT: vmov.i32 q2, #0x8
; CHECK-NEXT: .LBB0_6: @ %vector.body105
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q1, q1, q2
; CHECK-NEXT: vadd.i32 q0, q0, q2
; CHECK-NEXT: cbz r6, .LBB0_7
; CHECK-NEXT: le .LBB0_6
; CHECK-NEXT: .LBB0_7: @ %vector.body115.ph
; CHECK-NEXT: vldrw.u32 q0, [r9]
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: @APP
; CHECK-NEXT: nop
; CHECK-NEXT: @NO_APP
; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x4
; CHECK-NEXT: .LBB0_8: @ %vector.body115
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: b .LBB0_8
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.9:
; CHECK-NEXT: .LCPI0_0:
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 5 @ 0x5
; CHECK-NEXT: .long 6 @ 0x6
; CHECK-NEXT: .long 7 @ 0x7
; CHECK-NEXT: .LCPI0_1:
; CHECK-NEXT: .long 0 @ 0x0
; CHECK-NEXT: .long 1 @ 0x1
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 3 @ 0x3
entry:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %vec.ind = phi <8 x i32> [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %entry ], [ zeroinitializer, %vector.body ]
  %0 = and <8 x i32> %vec.ind, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %1 = icmp eq <8 x i32> %0, zeroinitializer
  %2 = select <8 x i1> %1, <8 x i16> <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>, <8 x i16> <i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6, i16 6>
  %3 = bitcast i16* undef to <8 x i16>*
  store <8 x i16> %2, <8 x i16>* %3, align 2
  %4 = icmp eq i32 undef, 128
  br i1 %4, label %for.cond4.preheader, label %vector.body

for.cond4.preheader:                              ; preds = %vector.body
  br i1 undef, label %vector.body105, label %for.body10

for.cond4.loopexit:                               ; preds = %for.body10
  %call5 = call arm_aapcs_vfpcc i32 bitcast (i32 (...)* @l to i32 ()*)()
  br label %vector.body105

for.body10:                                       ; preds = %for.body10, %for.cond4.preheader
  %exitcond88 = icmp eq i32 undef, 7
  br i1 %exitcond88, label %for.cond4.loopexit, label %for.body10

vector.body105:                                   ; preds = %vector.body105, %for.cond4.loopexit, %for.cond4.preheader
  %vec.ind113 = phi <8 x i32> [ %vec.ind.next114, %vector.body105 ], [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %for.cond4.loopexit ], [ <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %for.cond4.preheader ]
  %5 = and <8 x i32> %vec.ind113, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  %vec.ind.next114 = add <8 x i32> %vec.ind113, <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  %6 = icmp eq i32 undef, 256
  br i1 %6, label %vector.body115.ph, label %vector.body105

vector.body115.ph:                                ; preds = %vector.body105
  tail call void asm sideeffect "nop", "~{s0},~{s4},~{s8},~{s12},~{s16},~{s20},~{s24},~{s28},~{memory}"()
  br label %vector.body115

vector.body115:                                   ; preds = %vector.body115, %vector.body115.ph
  %vec.ind123 = phi <4 x i32> [ %vec.ind.next124, %vector.body115 ], [ <i32 0, i32 1, i32 2, i32 3>, %vector.body115.ph ]
  %7 = icmp eq <4 x i32> %vec.ind123, zeroinitializer
  %vec.ind.next124 = add <4 x i32> %vec.ind123, <i32 4, i32 4, i32 4, i32 4>
  br label %vector.body115
}

@a = external dso_local global i32, align 4
@b = dso_local local_unnamed_addr global i32 ptrtoint (i32* @a to i32), align 4
@c = dso_local global i32 2, align 4
@d = dso_local global i32 2, align 4

define dso_local i32 @e() #0 {
; CHECK-LABEL: e:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: .pad #408
; CHECK-NEXT: sub sp, #408
; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals
; CHECK-NEXT: vldr s12, .LCPI1_0
; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals
; CHECK-NEXT: vldr s15, .LCPI1_1
; CHECK-NEXT: mov r3, r7
; CHECK-NEXT: mov r4, r7
; CHECK-NEXT: ldr r0, [r3, #4]!
; CHECK-NEXT: movw r2, :lower16:e
; CHECK-NEXT: ldr r6, [r4, #8]!
; CHECK-NEXT: vmov r5, s15
; CHECK-NEXT: vmov s13, r3
; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: movt r2, :upper16:e
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: vmov q0[2], q0[0], r4, r4
; CHECK-NEXT: vmov s21, r2
; CHECK-NEXT: vmov.f32 s14, s13
; CHECK-NEXT: vmov q0[3], q0[1], r5, r2
; CHECK-NEXT: vmov.f32 s20, s12
; CHECK-NEXT: vdup.32 q7, r3
; CHECK-NEXT: vmov q6[2], q6[0], r3, r5
; CHECK-NEXT: vmov.f32 s22, s13
; CHECK-NEXT: vstrw.32 q0, [sp, #92]
; CHECK-NEXT: vmov q0, q7
; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
; CHECK-NEXT: vmov q4, q7
; CHECK-NEXT: vmov.32 q0[0], r2
; CHECK-NEXT: vmov.32 q7[1], r2
; CHECK-NEXT: vmov.f32 s23, s15
; CHECK-NEXT: movs r1, #64
; CHECK-NEXT: str r0, [sp, #40]
; CHECK-NEXT: vstrw.32 q5, [r0]
; CHECK-NEXT: str r6, [r0]
; CHECK-NEXT: vstrw.32 q7, [r0]
; CHECK-NEXT: str r0, [r0]
; CHECK-NEXT: vstrw.32 q0, [r0]
; CHECK-NEXT: vstrw.32 q6, [r0]
; CHECK-NEXT: mov.w r8, #0
; CHECK-NEXT: vmov q1[2], q1[0], r4, r3
; CHECK-NEXT: vmov q2[2], q2[0], r3, r3
; CHECK-NEXT: mov.w r12, #4
; CHECK-NEXT: vmov q1[3], q1[1], r2, r4
; CHECK-NEXT: vmov q2[3], q2[1], r4, r5
; CHECK-NEXT: vmov.32 q4[0], r8
; CHECK-NEXT: @ implicit-def: $r2
; CHECK-NEXT: str.w r8, [sp, #44]
; CHECK-NEXT: vstrw.32 q3, [sp, #60]
; CHECK-NEXT: strh.w r12, [sp, #406]
; CHECK-NEXT: wlstp.8 lr, r1, .LBB1_2
; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: letp lr, .LBB1_1
; CHECK-NEXT: .LBB1_2: @ %entry
; CHECK-NEXT: vstrw.32 q1, [r0]
; CHECK-NEXT: str.w r8, [r7]
; CHECK-NEXT: vstrw.32 q4, [r0]
; CHECK-NEXT: vstrw.32 q2, [r0]
; CHECK-NEXT: str.w r12, [sp, #324]
; CHECK-NEXT: .LBB1_3: @ %for.cond
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: b .LBB1_3
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: @ %bb.4:
; CHECK-NEXT: .LCPI1_0:
; CHECK-NEXT: .long 0x00000004 @ float 5.60519386E-45
; CHECK-NEXT: .LCPI1_1:
; CHECK-NEXT: .long 0x00000000 @ float 0
entry:
  %f = alloca i16, align 2
  %g = alloca [3 x [8 x [4 x i16*]]], align 4
  store i16 4, i16* %f, align 2
  %0 = load i32, i32* @c, align 4
  %1 = load i32, i32* @d, align 4
  %arrayinit.element7 = getelementptr inbounds [3 x [8 x [4 x i16*]]], [3 x [8 x [4 x i16*]]]* %g, i32 0, i32 0, i32 1, i32 1
  %2 = bitcast i16** %arrayinit.element7 to i32*
  store i32 %0, i32* %2, align 4
  %arrayinit.element8 = getelementptr inbounds [3 x [8 x [4 x i16*]]], [3 x [8 x [4 x i16*]]]* %g, i32 0, i32 0, i32 1, i32 2
  store i16* null, i16** %arrayinit.element8, align 4
  %3 = bitcast i16** undef to i32*
  store i32 %1, i32* %3, align 4
  %4 = bitcast i16** undef to i32*
  store i32 %0, i32* %4, align 4
  %arrayinit.element13 = getelementptr inbounds [3 x [8 x [4 x i16*]]], [3 x [8 x [4 x i16*]]]* %g, i32 0, i32 0, i32 2, i32 2
  %5 = bitcast i16** %arrayinit.element13 to <4 x i16*>*
  store <4 x i16*> <i16* inttoptr (i32 4 to i16*), i16* bitcast (i32* @c to i16*), i16* bitcast (i32* @c to i16*), i16* null>, <4 x i16*>* %5, align 4
  %arrayinit.element24 = getelementptr inbounds [3 x [8 x [4 x i16*]]], [3 x [8 x [4 x i16*]]]* %g, i32 0, i32 0, i32 4, i32 2
  %6 = bitcast i16** %arrayinit.element24 to <4 x i16*>*
  store <4 x i16*> <i16* bitcast (i32* @d to i16*), i16* null, i16* bitcast (i32* @d to i16*), i16* bitcast (i32 ()* @e to i16*)>, <4 x i16*>* %6, align 4
  %7 = bitcast i16** undef to <4 x i16*>*
  store <4 x i16*> <i16* inttoptr (i32 4 to i16*), i16* bitcast (i32 ()* @e to i16*), i16* bitcast (i32* @c to i16*), i16* null>, <4 x i16*>* %7, align 4
  %8 = bitcast i16** undef to <4 x i16*>*
  store <4 x i16*> <i16* bitcast (i32* @c to i16*), i16* bitcast (i32 ()* @e to i16*), i16* bitcast (i32* @c to i16*), i16* bitcast (i32* @c to i16*)>, <4 x i16*>* %8, align 4
  %9 = bitcast i16** undef to <4 x i16*>*
  store <4 x i16*> <i16* bitcast (i32 ()* @e to i16*), i16* bitcast (i32* @c to i16*), i16* bitcast (i32* @c to i16*), i16* bitcast (i32* @c to i16*)>, <4 x i16*>* %9, align 4
  %10 = bitcast i16** undef to <4 x i16*>*
  store <4 x i16*> <i16* bitcast (i32* @c to i16*), i16* bitcast (i32* @c to i16*), i16* null, i16* bitcast (i32 ()* @e to i16*)>, <4 x i16*>* %10, align 4
  call void @llvm.memset.p0i8.i32(i8* nonnull align 4 dereferenceable(64) undef, i8 0, i32 64, i1 false)
  %11 = bitcast i16** undef to <4 x i16*>*
  store <4 x i16*> <i16* bitcast (i32* @d to i16*), i16* bitcast (i32 ()* @e to i16*), i16* bitcast (i32* @c to i16*), i16* bitcast (i32* @d to i16*)>, <4 x i16*>* %11, align 4
  %12 = bitcast i16** undef to <4 x i16*>*
  store <4 x i16*> <i16* null, i16* bitcast (i32* @c to i16*), i16* bitcast (i32* @c to i16*), i16* bitcast (i32* @c to i16*)>, <4 x i16*>* %12, align 4
  %13 = bitcast i16** undef to <4 x i16*>*
  store <4 x i16*> <i16* bitcast (i32* @c to i16*), i16* bitcast (i32* @d to i16*), i16* bitcast (i32* @c to i16*), i16* null>, <4 x i16*>* %13, align 4
  %arrayinit.begin78 = getelementptr inbounds [3 x [8 x [4 x i16*]]], [3 x [8 x [4 x i16*]]]* %g, i32 0, i32 2, i32 3, i32 0
  store i16* inttoptr (i32 4 to i16*), i16** %arrayinit.begin78, align 4
  store i32 0, i32* @b, align 4
  br label %for.cond

for.cond:                                         ; preds = %for.cond, %entry
  br label %for.cond
}

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg) #1

; Function Attrs: argmemonly nounwind willreturn
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1

declare arm_aapcs_vfpcc i32 @l(...)