mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 12:41:49 +01:00
[SelectionDAG] Don't promote the alignment of allocas beyond the stack alignment.
Allocas in LLVM IR have a specified alignment. When that alignment is specified, the alloca has at least that alignment at runtime. If the allocated type of the alloca has a higher preferred alignment, SelectionDAG currently ignores the specified alignment and increases the alignment to the preferred one. It does this even if doing so would trigger stack realignment. I don't think this makes sense, so this patch changes that behavior: the alignment is now only promoted as long as it does not exceed the stack alignment. I was looking into this for SVE in particular: for SVE, overaligning vscale'ed types is extra expensive because it requires realigning the stack multiple times, or using dynamic allocation. (Overaligning such types currently isn't implemented.) I updated the expected assembly for a couple of tests; in particular, for arg-copy-elide.ll, the optimization in question does not increase the alignment the way SelectionDAG normally would. For the rest, I just increased the specified alignment on the allocas to match what SelectionDAG was inferring. Differential Revision: https://reviews.llvm.org/D79532
This commit is contained in:
parent
6ace50965a
commit
f704804dd2
@ -134,8 +134,20 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
|
||||
for (const Instruction &I : BB) {
|
||||
if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
|
||||
Type *Ty = AI->getAllocatedType();
|
||||
Align TyPrefAlign = MF->getDataLayout().getPrefTypeAlign(Ty);
|
||||
// The "specified" alignment is the alignment written on the alloca,
|
||||
// or the preferred alignment of the type if none is specified.
|
||||
//
|
||||
// (Unspecified alignment on allocas will be going away soon.)
|
||||
Align SpecifiedAlign = AI->getAlign() ? *AI->getAlign() : TyPrefAlign;
|
||||
|
||||
// If the preferred alignment of the type is higher than the specified
|
||||
// alignment of the alloca, promote the alignment, as long as it doesn't
|
||||
// require realigning the stack.
|
||||
//
|
||||
// FIXME: Do we really want to second-guess the IR in isel?
|
||||
Align Alignment =
|
||||
max(MF->getDataLayout().getPrefTypeAlign(Ty), AI->getAlign());
|
||||
std::max(std::min(TyPrefAlign, StackAlign), SpecifiedAlign);
|
||||
|
||||
// Static allocas can be folded into the initial stack frame
|
||||
// adjustment. For targets that don't realign the stack, don't
|
||||
|
@ -15,3 +15,18 @@ define i32 @foo(<vscale x 16 x i8> %val) {
|
||||
}
|
||||
|
||||
declare i32 @bar(<vscale x 16 x i8>* %ptr);
|
||||
|
||||
; CHECKCG-LABEL: foo2:
|
||||
; CHECKCG: addvl sp, sp, #-2
|
||||
|
||||
; CHECKISEL-LABEL: name: foo2
|
||||
; CHECKISEL: stack:
|
||||
; CHECKISEL: id: 0, name: ptr, type: default, offset: 0, size: 32, alignment: 16,
|
||||
; CHECKISEL-NEXT: stack-id: sve-vec
|
||||
|
||||
define i32 @foo2(<vscale x 32 x i8> %val) {
|
||||
%ptr = alloca <vscale x 32 x i8>, align 16
|
||||
%res = call i32 @bar2(<vscale x 32 x i8>* %ptr)
|
||||
ret i32 %res
|
||||
}
|
||||
declare i32 @bar2(<vscale x 32 x i8>* %ptr);
|
||||
|
@ -113,7 +113,7 @@ entry:
|
||||
%gep.index = getelementptr inbounds i32, i32 addrspace(1)* %index.array, i64 %idxprom
|
||||
%index.load = load i32, i32 addrspace(1)* %gep.index
|
||||
%index = and i32 %index.load, 2
|
||||
%alloca = alloca [2 x <8 x i32>], align 16, addrspace(5)
|
||||
%alloca = alloca [2 x <8 x i32>], align 32, addrspace(5)
|
||||
%gep0 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>] addrspace(5)* %alloca, i32 0, i32 0
|
||||
%gep1 = getelementptr inbounds [2 x <8 x i32>], [2 x <8 x i32>] addrspace(5)* %alloca, i32 0, i32 1
|
||||
store <8 x i32> zeroinitializer, <8 x i32> addrspace(5)* %gep0
|
||||
|
@ -51,8 +51,8 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
|
||||
; GCN-NEXT: BB0_2: ; %shader_eval_surface.exit
|
||||
; GCN-NEXT: s_endpgm
|
||||
entry:
|
||||
%sd = alloca < 1339 x i32>, align 16, addrspace(5)
|
||||
%state = alloca <4 x i32>, align 4, addrspace(5)
|
||||
%sd = alloca < 1339 x i32>, align 8192, addrspace(5)
|
||||
%state = alloca <4 x i32>, align 16, addrspace(5)
|
||||
%rslt = call i32 @svm_eval_nodes(float addrspace(5)* %kg, <1339 x i32> addrspace(5)* %sd, <4 x i32> addrspace(5)* %state, i32 0, i32 4194304)
|
||||
%cmp = icmp eq i32 %rslt, 0
|
||||
br i1 %cmp, label %shader_eval_surface.exit, label %if.then4.i
|
||||
|
@ -34,7 +34,7 @@ entry:
|
||||
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r[[R1]]:128]
|
||||
; CHECK: vst1.32 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]!
|
||||
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
|
||||
%retval = alloca <16 x float>, align 16
|
||||
%retval = alloca <16 x float>, align 64
|
||||
%0 = load <16 x float>, <16 x float>* @T3_retval, align 16
|
||||
store <16 x float> %0, <16 x float>* %retval
|
||||
%1 = load <16 x float>, <16 x float>* %retval
|
||||
@ -73,7 +73,7 @@ entry:
|
||||
; CHECK: vst1.64 {{{d[0-9]+}}, {{d[0-9]+}}}, [r0:128]
|
||||
|
||||
|
||||
%retval = alloca <16 x float>, align 16
|
||||
%retval = alloca <16 x float>, align 64
|
||||
%0 = load <16 x float>, <16 x float>* @T3_retval, align 16
|
||||
store <16 x float> %0, <16 x float>* %retval
|
||||
%1 = load <16 x float>, <16 x float>* %retval
|
||||
|
@ -44,18 +44,12 @@ define void @vector_f64_copy(<2 x double>* %from, <2 x double>* %to) {
|
||||
define arm_aapcs_vfpcc <16 x i8> @stack_slot_handling(<16 x i8> %a) #0 {
|
||||
; CHECK-LABEL: stack_slot_handling:
|
||||
; CHECK: @ %bb.0: @ %entry
|
||||
; CHECK-NEXT: push {r4, r6, r7, lr}
|
||||
; CHECK-NEXT: add r7, sp, #8
|
||||
; CHECK-NEXT: sub sp, #16
|
||||
; CHECK-NEXT: mov r4, sp
|
||||
; CHECK-NEXT: bfc r4, #0, #4
|
||||
; CHECK-NEXT: mov sp, r4
|
||||
; CHECK-NEXT: mov r0, sp
|
||||
; CHECK-NEXT: vstrw.32 q0, [r0]
|
||||
; CHECK-NEXT: vldrw.u32 q0, [r0]
|
||||
; CHECK-NEXT: sub.w r4, r7, #8
|
||||
; CHECK-NEXT: mov sp, r4
|
||||
; CHECK-NEXT: pop {r4, r6, r7, pc}
|
||||
; CHECK-NEXT: add sp, #16
|
||||
; CHECK-NEXT: bx lr
|
||||
entry:
|
||||
%a.addr = alloca <16 x i8>, align 8
|
||||
store <16 x i8> %a, <16 x i8>* %a.addr, align 8
|
||||
|
@ -53,22 +53,18 @@ entry:
|
||||
}
|
||||
|
||||
; CHECK-LABEL: _split_i64:
|
||||
; CHECK: pushl %ebp
|
||||
; CHECK: movl %esp, %ebp
|
||||
; CHECK: pushl %[[csr2:[^ ]*]]
|
||||
; CHECK: pushl %[[csr1:[^ ]*]]
|
||||
; CHECK: andl $-8, %esp
|
||||
; CHECK-DAG: movl 8(%ebp), %[[csr1]]
|
||||
; CHECK-DAG: movl 12(%ebp), %[[csr2]]
|
||||
; CHECK-DAG: leal 8(%ebp), %[[reg:[^ ]*]]
|
||||
; CHECK-DAG: movl 12(%esp), %[[csr1]]
|
||||
; CHECK-DAG: movl 16(%esp), %[[csr2]]
|
||||
; CHECK-DAG: leal 12(%esp), %[[reg:[^ ]*]]
|
||||
; CHECK: pushl %[[reg]]
|
||||
; CHECK: calll _addrof_i64
|
||||
; CHECK: addl $4, %esp
|
||||
; CHECK-DAG: movl %[[csr1]], %eax
|
||||
; CHECK-DAG: movl %[[csr2]], %edx
|
||||
; CHECK: leal -8(%ebp), %esp
|
||||
; CHECK: popl %[[csr1]]
|
||||
; CHECK: popl %[[csr2]]
|
||||
; CHECK: popl %ebp
|
||||
; CHECK: retl
|
||||
|
||||
define i1 @i1_arg(i1 %x) {
|
||||
@ -101,16 +97,13 @@ entry:
|
||||
}
|
||||
|
||||
; CHECK-LABEL: _fastcc_split_i64:
|
||||
; CHECK: pushl %ebp
|
||||
; CHECK: movl %esp, %ebp
|
||||
; CHECK-DAG: movl %edx, %[[r1:[^ ]*]]
|
||||
; CHECK-DAG: movl 8(%ebp), %[[r2:[^ ]*]]
|
||||
; CHECK-DAG: movl 20(%esp), %[[r2:[^ ]*]]
|
||||
; CHECK-DAG: movl %[[r2]], 4(%esp)
|
||||
; CHECK-DAG: movl %edx, (%esp)
|
||||
; CHECK: movl %esp, %[[reg:[^ ]*]]
|
||||
; CHECK: pushl %[[reg]]
|
||||
; CHECK: calll _addrof_i64
|
||||
; CHECK: popl %ebp
|
||||
; CHECK: retl
|
||||
|
||||
|
||||
|
@ -1164,9 +1164,9 @@ define void @isel_crash_32b(i8* %cV_R.addr) {
|
||||
; X64-NEXT: vzeroupper
|
||||
; X64-NEXT: retq
|
||||
eintry:
|
||||
%__a.addr.i = alloca <4 x i64>, align 16
|
||||
%__b.addr.i = alloca <4 x i64>, align 16
|
||||
%vCr = alloca <4 x i64>, align 16
|
||||
%__a.addr.i = alloca <4 x i64>, align 32
|
||||
%__b.addr.i = alloca <4 x i64>, align 32
|
||||
%vCr = alloca <4 x i64>, align 32
|
||||
store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
|
||||
%tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
|
||||
%tmp2 = load i8, i8* %cV_R.addr, align 4
|
||||
@ -1255,9 +1255,9 @@ define void @isel_crash_16w(i16* %cV_R.addr) {
|
||||
; X64-NEXT: vzeroupper
|
||||
; X64-NEXT: retq
|
||||
eintry:
|
||||
%__a.addr.i = alloca <4 x i64>, align 16
|
||||
%__b.addr.i = alloca <4 x i64>, align 16
|
||||
%vCr = alloca <4 x i64>, align 16
|
||||
%__a.addr.i = alloca <4 x i64>, align 32
|
||||
%__b.addr.i = alloca <4 x i64>, align 32
|
||||
%vCr = alloca <4 x i64>, align 32
|
||||
store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
|
||||
%tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
|
||||
%tmp2 = load i16, i16* %cV_R.addr, align 4
|
||||
@ -1346,9 +1346,9 @@ define void @isel_crash_8d(i32* %cV_R.addr) {
|
||||
; X64-NEXT: vzeroupper
|
||||
; X64-NEXT: retq
|
||||
eintry:
|
||||
%__a.addr.i = alloca <4 x i64>, align 16
|
||||
%__b.addr.i = alloca <4 x i64>, align 16
|
||||
%vCr = alloca <4 x i64>, align 16
|
||||
%__a.addr.i = alloca <4 x i64>, align 32
|
||||
%__b.addr.i = alloca <4 x i64>, align 32
|
||||
%vCr = alloca <4 x i64>, align 32
|
||||
store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
|
||||
%tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
|
||||
%tmp2 = load i32, i32* %cV_R.addr, align 4
|
||||
@ -1436,9 +1436,9 @@ define void @isel_crash_4q(i64* %cV_R.addr) {
|
||||
; X64-NEXT: vzeroupper
|
||||
; X64-NEXT: retq
|
||||
eintry:
|
||||
%__a.addr.i = alloca <4 x i64>, align 16
|
||||
%__b.addr.i = alloca <4 x i64>, align 16
|
||||
%vCr = alloca <4 x i64>, align 16
|
||||
%__a.addr.i = alloca <4 x i64>, align 32
|
||||
%__b.addr.i = alloca <4 x i64>, align 32
|
||||
%vCr = alloca <4 x i64>, align 32
|
||||
store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
|
||||
%tmp = load <4 x i64>, <4 x i64>* %vCr, align 16
|
||||
%tmp2 = load i64, i64* %cV_R.addr, align 4
|
||||
|
@ -77,7 +77,7 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
|
||||
; X64-NEXT: popq %r13
|
||||
; X64-NEXT: popq %rbp
|
||||
; X64-NEXT: retq
|
||||
%y = alloca <16 x float>, align 16
|
||||
%y = alloca <16 x float>, align 64
|
||||
%x = fadd <16 x float> %a, %b
|
||||
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
|
||||
%2 = load <16 x float>, <16 x float>* %y, align 16
|
||||
@ -158,7 +158,7 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
|
||||
; X64-NEXT: popq %r13
|
||||
; X64-NEXT: popq %rbp
|
||||
; X64-NEXT: retq
|
||||
%y = alloca <16 x float>, align 16
|
||||
%y = alloca <16 x float>, align 64
|
||||
%x = fadd <16 x float> %a, %b
|
||||
%1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
|
||||
%2 = load <16 x float>, <16 x float>* %y, align 16
|
||||
|
@ -4,26 +4,20 @@
|
||||
define void @_start() {
|
||||
; CHECK-LABEL: _start:
|
||||
; CHECK: # %bb.0: # %Entry
|
||||
; CHECK-NEXT: pushq %rbp
|
||||
; CHECK-NEXT: pushq %rax
|
||||
; CHECK-NEXT: .cfi_def_cfa_offset 16
|
||||
; CHECK-NEXT: .cfi_offset %rbp, -16
|
||||
; CHECK-NEXT: movq %rsp, %rbp
|
||||
; CHECK-NEXT: .cfi_def_cfa_register %rbp
|
||||
; CHECK-NEXT: andq $-128, %rsp
|
||||
; CHECK-NEXT: subq $256, %rsp # imm = 0x100
|
||||
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
|
||||
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
|
||||
; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rax
|
||||
; CHECK-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
|
||||
; CHECK-NEXT: shrdq $2, %rcx, %rax
|
||||
; CHECK-NEXT: shrq $2, %rcx
|
||||
; CHECK-NEXT: leaq 1(,%rax,4), %rdx
|
||||
; CHECK-NEXT: movq %rdx, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: shrdq $62, %rcx, %rax
|
||||
; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: orq $-2, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movq $-1, {{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movq %rbp, %rsp
|
||||
; CHECK-NEXT: popq %rbp
|
||||
; CHECK-NEXT: .cfi_def_cfa %rsp, 8
|
||||
; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: orq $-2, -{{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: movq $-1, -{{[0-9]+}}(%rsp)
|
||||
; CHECK-NEXT: popq %rax
|
||||
; CHECK-NEXT: .cfi_def_cfa_offset 8
|
||||
; CHECK-NEXT: retq
|
||||
Entry:
|
||||
%y = alloca <3 x i129>, align 4
|
||||
|
@ -246,7 +246,7 @@ define void @test9() optsize {
|
||||
entry:
|
||||
%p = alloca i32, align 4
|
||||
%q = alloca i32, align 4
|
||||
%s = alloca %struct.s, align 4
|
||||
%s = alloca %struct.s, align 8
|
||||
call void @good(i32 1, i32 2, i32 3, i32 4)
|
||||
%pv = ptrtoint i32* %p to i32
|
||||
%qv = ptrtoint i32* %q to i32
|
||||
@ -407,7 +407,7 @@ declare void @B_func(%struct.B* sret, %struct.B*, i32)
|
||||
define void @test14(%struct.A* %a) {
|
||||
entry:
|
||||
%ref.tmp = alloca %struct.B, align 1
|
||||
%agg.tmp = alloca i64, align 4
|
||||
%agg.tmp = alloca i64, align 8
|
||||
%tmpcast = bitcast i64* %agg.tmp to %struct.A*
|
||||
%tmp = alloca %struct.B, align 1
|
||||
%0 = bitcast %struct.A* %a to i64*
|
||||
|
Loading…
x
Reference in New Issue
Block a user