1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-24 03:33:20 +01:00

[AMDGPU] Introduce more scratch registers in the ABI.

The AMDGPU target has a convention that defines all VGPRs
(except the initial 32 argument registers) as callee-saved.
This convention is not always efficient: when the callee
requires more registers, it ends up emitting a large number of
spills, even though its caller requires only a few.

This patch revises the ABI by introducing more scratch registers
that a callee can freely use.
The 256 VGPR registers now become:
  32 argument registers
  112 scratch registers and
  112 callee saved registers.
The scratch registers and the CSRs are intermixed at regular
intervals (a split boundary of 8) to obtain a better occupancy.

Reviewers: arsenm, t-tye, rampitec, b-sumner, mjbedy, tpr

Reviewed By: arsenm, t-tye

Differential Revision: https://reviews.llvm.org/D76356
This commit is contained in:
Christudasan Devadasan 2020-03-27 03:46:51 -04:00
parent c814314f5f
commit 8553d88165
22 changed files with 399 additions and 230 deletions

View File

@ -6507,11 +6507,27 @@ On exit from a function:
* FLAT_SCRATCH
* EXEC
* GFX6-8: M0
* All SGPR and VGPR registers except the clobbered registers of SGPR4-31 and
VGPR0-31.
* All SGPR registers except the clobbered registers of SGPR4-31.
* VGPR40-47
VGPR56-63
VGPR72-79
VGPR88-95
VGPR104-111
VGPR120-127
VGPR136-143
VGPR152-159
VGPR168-175
VGPR184-191
VGPR200-207
VGPR216-223
VGPR232-239
VGPR248-255
*Except for the argument registers, the clobbered and the preserved
VGPR registers are intermixed at regular intervals in order to
obtain a better occupancy.*
For the AMDGPU backend, an inter-procedural register allocation (IPRA)
optimization may mark some of clobbered SGPR4-31 and VGPR0-31 registers as
optimization may mark some of clobbered SGPR and VGPR registers as
preserved if it can be determined that the called function does not change
their value.

View File

@ -89,6 +89,24 @@ def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
(sequence "VGPR%u", 32, 255)
>;
def CSR_AMDGPU_VGPRs : CalleeSavedRegs<
// The CSRs & scratch-registers are interleaved at a split boundary of 8.
(add (sequence "VGPR%u", 40, 47),
(sequence "VGPR%u", 56, 63),
(sequence "VGPR%u", 72, 79),
(sequence "VGPR%u", 88, 95),
(sequence "VGPR%u", 104, 111),
(sequence "VGPR%u", 120, 127),
(sequence "VGPR%u", 136, 143),
(sequence "VGPR%u", 152, 159),
(sequence "VGPR%u", 168, 175),
(sequence "VGPR%u", 184, 191),
(sequence "VGPR%u", 200, 207),
(sequence "VGPR%u", 216, 223),
(sequence "VGPR%u", 232, 239),
(sequence "VGPR%u", 248, 255))
>;
def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
(sequence "SGPR%u", 32, 105)
>;
@ -104,7 +122,7 @@ def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
>;
def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
(add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105)
(add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105)
>;
// Calling convention for leaf functions

View File

@ -727,9 +727,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GPRIDX-NEXT: s_mov_b32 s18, 0
; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000
; GPRIDX-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GPRIDX-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GPRIDX-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill
; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000
; GPRIDX-NEXT: s_mov_b32 s16, s18
; GPRIDX-NEXT: s_mov_b32 s15, 0x40180000
@ -793,9 +790,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
; GPRIDX-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
; GPRIDX-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
; GPRIDX-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GPRIDX-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GPRIDX-NEXT: s_waitcnt vmcnt(0)
; GPRIDX-NEXT: s_setpc_b64 s[30:31]
;
@ -816,9 +810,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; MOVREL-NEXT: s_mov_b32 s8, s18
; MOVREL-NEXT: s_mov_b64 s[6:7], 2.0
; MOVREL-NEXT: s_mov_b64 s[4:5], 1.0
; MOVREL-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; MOVREL-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; MOVREL-NEXT: buffer_store_dword v34, off, s[0:3], s32 ; 4-byte Folded Spill
; MOVREL-NEXT: v_mov_b32_e32 v34, s19
; MOVREL-NEXT: v_mov_b32_e32 v33, s18
; MOVREL-NEXT: v_mov_b32_e32 v32, s17
@ -868,10 +859,6 @@ define void @dyn_insertelement_v8f64_const_s_v_v(double %val, i32 %idx) {
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[7:10], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[11:14], off
; MOVREL-NEXT: global_store_dwordx4 v[0:1], v[15:18], off
; MOVREL-NEXT: buffer_load_dword v34, off, s[0:3], s32 ; 4-byte Folded Reload
; MOVREL-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; MOVREL-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; MOVREL-NEXT: s_waitcnt vmcnt(0)
; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0
; MOVREL-NEXT: s_setpc_b64 s[30:31]
entry:

View File

@ -744,17 +744,13 @@ entry:
; GCN-LABEL: {{^}}tail_call_byval_align16:
; GCN-NOT: s32
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GCN: s_getpc_b64
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_byval_align16(<32 x i32> %val, double %tmp) #0 {
@ -766,15 +762,11 @@ entry:
; GCN-LABEL: {{^}}tail_call_stack_passed_arg_alignment_v32i32_f64:
; GCN-NOT: s32
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN: buffer_load_dword v33, off, s[0:3], s32{{$}}
; GCN: s_getpc_b64
; GCN: buffer_store_dword v33, off, s[0:3], s32{{$}}
; GCN: buffer_store_dword v32, off, s[0:3], s32 offset:4
; GCN: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-NOT: s32
; GCN: s_setpc_b64
define void @tail_call_stack_passed_arg_alignment_v32i32_f64(<32 x i32> %val, double %tmp) #0 {

View File

@ -13,15 +13,15 @@ define void @use_vcc() #1 {
}
; GCN-LABEL: {{^}}indirect_use_vcc:
; GCN: v_writelane_b32 v32, s33, 2
; GCN: v_writelane_b32 v32, s30, 0
; GCN: v_writelane_b32 v32, s31, 1
; GCN: v_writelane_b32 v40, s33, 2
; GCN: v_writelane_b32 v40, s30, 0
; GCN: v_writelane_b32 v40, s31, 1
; GCN: s_swappc_b64
; GCN: v_readlane_b32 s4, v32, 0
; GCN: v_readlane_b32 s5, v32, 1
; GCN: v_readlane_b32 s33, v32, 2
; GCN: v_readlane_b32 s4, v40, 0
; GCN: v_readlane_b32 s5, v40, 1
; GCN: v_readlane_b32 s33, v40, 2
; GCN: ; NumSgprs: 36
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define void @indirect_use_vcc() #1 {
call void @use_vcc()
ret void
@ -32,7 +32,7 @@ define void @indirect_use_vcc() #1 {
; CI: ; NumSgprs: 38
; VI-NOBUG: ; NumSgprs: 40
; VI-BUG: ; NumSgprs: 96
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define amdgpu_kernel void @indirect_2level_use_vcc_kernel(i32 addrspace(1)* %out) #0 {
call void @indirect_use_vcc()
ret void
@ -50,7 +50,7 @@ define void @use_flat_scratch() #1 {
; GCN-LABEL: {{^}}indirect_use_flat_scratch:
; CI: ; NumSgprs: 38
; VI: ; NumSgprs: 40
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define void @indirect_use_flat_scratch() #1 {
call void @use_flat_scratch()
ret void
@ -61,7 +61,7 @@ define void @indirect_use_flat_scratch() #1 {
; CI: ; NumSgprs: 38
; VI-NOBUG: ; NumSgprs: 40
; VI-BUG: ; NumSgprs: 96
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(i32 addrspace(1)* %out) #0 {
call void @indirect_use_flat_scratch()
ret void
@ -76,7 +76,7 @@ define void @use_10_vgpr() #1 {
}
; GCN-LABEL: {{^}}indirect_use_10_vgpr:
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define void @indirect_use_10_vgpr() #0 {
call void @use_10_vgpr()
ret void
@ -84,23 +84,23 @@ define void @indirect_use_10_vgpr() #0 {
; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr:
; GCN: is_dynamic_callstack = 0
; GCN: ; NumVgprs: 33
; GCN: ; NumVgprs: 41
define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 {
call void @indirect_use_10_vgpr()
ret void
}
; GCN-LABEL: {{^}}use_40_vgpr:
; GCN: ; NumVgprs: 40
define void @use_40_vgpr() #1 {
call void asm sideeffect "", "~{v39}"() #0
; GCN-LABEL: {{^}}use_50_vgpr:
; GCN: ; NumVgprs: 50
define void @use_50_vgpr() #1 {
call void asm sideeffect "", "~{v49}"() #0
ret void
}
; GCN-LABEL: {{^}}indirect_use_40_vgpr:
; GCN: ; NumVgprs: 40
define void @indirect_use_40_vgpr() #0 {
call void @use_40_vgpr()
; GCN-LABEL: {{^}}indirect_use_50_vgpr:
; GCN: ; NumVgprs: 50
define void @indirect_use_50_vgpr() #0 {
call void @use_50_vgpr()
ret void
}

View File

@ -23,22 +23,22 @@ define amdgpu_kernel void @test_kernel_call_external_void_func_void_clobber_s30_
; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void:
; GCN: buffer_store_dword
; GCN: v_writelane_b32 v32, s33, 4
; GCN: v_writelane_b32 v32, s34, 0
; GCN: v_writelane_b32 v32, s35, 1
; GCN: v_writelane_b32 v32, s30, 2
; GCN: v_writelane_b32 v32, s31, 3
; GCN: v_writelane_b32 v40, s33, 4
; GCN: v_writelane_b32 v40, s34, 0
; GCN: v_writelane_b32 v40, s35, 1
; GCN: v_writelane_b32 v40, s30, 2
; GCN: v_writelane_b32 v40, s31, 3
; GCN: s_swappc_b64
; GCN-NEXT: ;;#ASMSTART
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_swappc_b64
; GCN-DAG: v_readlane_b32 s4, v32, 2
; GCN-DAG: v_readlane_b32 s5, v32, 3
; GCN: v_readlane_b32 s35, v32, 1
; GCN: v_readlane_b32 s34, v32, 0
; GCN-DAG: v_readlane_b32 s4, v40, 2
; GCN-DAG: v_readlane_b32 s5, v40, 3
; GCN: v_readlane_b32 s35, v40, 1
; GCN: v_readlane_b32 s34, v40, 0
; GCN: v_readlane_b32 s33, v32, 4
; GCN: v_readlane_b32 s33, v40, 4
; GCN: buffer_load_dword
; GCN: s_setpc_b64
define void @test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void() #0 {
@ -49,16 +49,16 @@ define void @test_func_call_external_void_func_void_clobber_s30_s31_call_externa
}
; GCN-LABEL: {{^}}test_func_call_external_void_funcx2:
; GCN: buffer_store_dword v32
; GCN: v_writelane_b32 v32, s33, 4
; GCN: buffer_store_dword v40
; GCN: v_writelane_b32 v40, s33, 4
; GCN: s_mov_b32 s33, s32
; GCN: s_add_u32 s32, s32, 0x400
; GCN: s_swappc_b64
; GCN-NEXT: s_swappc_b64
; GCN: v_readlane_b32 s33, v32, 4
; GCN: buffer_load_dword v32,
; GCN: v_readlane_b32 s33, v40, 4
; GCN: buffer_load_dword v40,
define void @test_func_call_external_void_funcx2() #0 {
call void @external_void_func_void()
call void @external_void_func_void()
@ -115,9 +115,9 @@ define amdgpu_kernel void @test_call_void_func_void_mayclobber_s31(i32 addrspace
}
; GCN-LABEL: {{^}}test_call_void_func_void_mayclobber_v31:
; GCN: v_mov_b32_e32 v32, v31
; GCN: v_mov_b32_e32 v40, v31
; GCN-NEXT: s_swappc_b64
; GCN-NEXT: v_mov_b32_e32 v31, v32
; GCN-NEXT: v_mov_b32_e32 v31, v40
define amdgpu_kernel void @test_call_void_func_void_mayclobber_v31(i32 addrspace(1)* %out) #0 {
%v31 = call i32 asm sideeffect "; def $0", "={v31}"()
call void @external_void_func_void()
@ -177,31 +177,31 @@ define amdgpu_kernel void @test_call_void_func_void_preserves_s34(i32 addrspace(
ret void
}
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v32: {{.*}}
; GCN-LABEL: {{^}}test_call_void_func_void_preserves_v40: {{.*}}
; GCN-NOT: v32
; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, external_void_func_void@rel32@hi+4
; GCN: s_mov_b32 s32, 0
; GCN-NOT: v32
; GCN-NOT: v40
; GCN: ;;#ASMSTART
; GCN-NEXT: ; def v32
; GCN-NEXT: ; def v40
; GCN-NEXT: ;;#ASMEND
; GCN: s_swappc_b64 s[30:31], s[4:5]
; GCN-NOT: v32
; GCN-NOT: v40
; GCN: ;;#ASMSTART
; GCN-NEXT: ; use v32
; GCN-NEXT: ; use v40
; GCN-NEXT: ;;#ASMEND
; GCN-NEXT: s_endpgm
define amdgpu_kernel void @test_call_void_func_void_preserves_v32(i32 addrspace(1)* %out) #0 {
%v32 = call i32 asm sideeffect "; def $0", "={v32}"()
define amdgpu_kernel void @test_call_void_func_void_preserves_v40(i32 addrspace(1)* %out) #0 {
%v40 = call i32 asm sideeffect "; def $0", "={v40}"()
call void @external_void_func_void()
call void asm sideeffect "; use $0", "{v32}"(i32 %v32)
call void asm sideeffect "; use $0", "{v40}"(i32 %v40)
ret void
}
@ -255,12 +255,12 @@ define amdgpu_kernel void @test_call_void_func_void_clobber_s34() #0 {
; GCN-LABEL: {{^}}callee_saved_sgpr_func:
; GCN-NOT: s40
; GCN: v_writelane_b32 v32, s40
; GCN: v_writelane_b32 v40, s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
; GCN: v_readlane_b32 s40, v32
; GCN: v_readlane_b32 s40, v40
; GCN-NOT: s40
define void @callee_saved_sgpr_func() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
@ -287,19 +287,19 @@ define amdgpu_kernel void @callee_saved_sgpr_kernel() #2 {
; First call preserved VGPR is used so it can't be used for SGPR spills.
; GCN-LABEL: {{^}}callee_saved_sgpr_vgpr_func:
; GCN-NOT: s40
; GCN: v_writelane_b32 v33, s40
; GCN: v_writelane_b32 v41, s40
; GCN: s_swappc_b64
; GCN-NOT: s40
; GCN: ; use s40
; GCN-NOT: s40
; GCN: v_readlane_b32 s40, v33
; GCN: v_readlane_b32 s40, v41
; GCN-NOT: s40
define void @callee_saved_sgpr_vgpr_func() #2 {
%s40 = call i32 asm sideeffect "; def s40", "={s40}"() #0
%v32 = call i32 asm sideeffect "; def v32", "={v32}"() #0
%v40 = call i32 asm sideeffect "; def v40", "={v40}"() #0
call void @external_void_func_void()
call void asm sideeffect "; use $0", "s"(i32 %s40) #0
call void asm sideeffect "; use $0", "v"(i32 %v32) #0
call void asm sideeffect "; use $0", "v"(i32 %v40) #0
ret void
}

View File

@ -64,11 +64,11 @@ define amdgpu_kernel void @call_no_wait_after_call(i32 addrspace(1)* %ptr, i32)
; GCN-NEXT: s_add_u32 s4, s4, func@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func@rel32@hi+4
; GCN-NEXT: s_mov_b32 s32, 0
; GCN-NEXT: v_mov_b32_e32 v32, 0
; GCN-NEXT: v_mov_b32_e32 v40, 0
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_mov_b32_e32 v0, s34
; GCN-NEXT: v_mov_b32_e32 v1, s35
; GCN-NEXT: global_store_dword v[0:1], v32, off
; GCN-NEXT: global_store_dword v[0:1], v40, off
; GCN-NEXT: s_endpgm
call void @func(i32 0)
store i32 0, i32 addrspace(1)* %ptr

View File

@ -127,8 +127,8 @@ define void @callee_with_stack_and_call() #0 {
; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
; GCN: s_swappc_b64
; GCN-DAG: v_readlane_b32 s4, v32, 0
; GCN-DAG: v_readlane_b32 s5, v32, 1
; GCN-DAG: v_readlane_b32 s4, v40, 0
; GCN-DAG: v_readlane_b32 s5, v40, 1
; GCN: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], [[FP_SPILL_LANE]]
@ -168,6 +168,7 @@ define void @callee_func_sgpr_spill_no_calls(i32 %in) #0 {
call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0
call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0
call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0
call void asm sideeffect "", "~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #0
%wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
%wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
@ -207,14 +208,14 @@ define void @spill_only_csr_sgpr() {
; GCN-NEXT:s_mov_b32 [[FP_COPY:s[0-9]+]], s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
; GCN-DAG: buffer_store_dword v33, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33 offset:8
; GCN: ;;#ASMSTART
; GCN-NEXT: ; clobber v33
; GCN-NEXT: ; clobber v41
; GCN-NEXT: ;;#ASMEND
; GCN: buffer_load_dword v33, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN: s_add_u32 s32, s32, 0x300
; GCN-NEXT: s_sub_u32 s32, s32, 0x300
; GCN-NEXT: s_mov_b32 s33, s4
@ -223,7 +224,7 @@ define void @spill_only_csr_sgpr() {
define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
call void asm sideeffect "; clobber v33", "~{v33}"()
call void asm sideeffect "; clobber v41", "~{v41}"()
ret void
}
@ -232,7 +233,7 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
; GCN: s_waitcnt
; GCN-NEXT: v_writelane_b32 v1, s33, 63
; GCN-NEXT: s_mov_b32 s33, s32
; GCN: buffer_store_dword v33, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-COUNT-63: v_writelane_b32 v1
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:8
; GCN: ;;#ASMSTART
@ -246,7 +247,7 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 {
define void @last_lane_vgpr_for_fp_csr() #1 {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
call void asm sideeffect "; clobber v33", "~{v33}"()
call void asm sideeffect "; clobber v41", "~{v41}"()
call void asm sideeffect "",
"~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49}
,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59}
@ -264,14 +265,14 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
; GCN: s_waitcnt
; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: buffer_store_dword v33, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-COUNT-64: v_writelane_b32 v1,
; GCN: buffer_store_dword
; GCN: ;;#ASMSTART
; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1
; GCN: buffer_load_dword v33, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN: s_add_u32 s32, s32, 0x300
; GCN-NEXT: s_sub_u32 s32, s32, 0x300
; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
@ -280,7 +281,7 @@ define void @last_lane_vgpr_for_fp_csr() #1 {
define void @no_new_vgpr_for_fp_csr() #1 {
%alloca = alloca i32, addrspace(5)
store volatile i32 0, i32 addrspace(5)* %alloca
call void asm sideeffect "; clobber v33", "~{v33}"()
call void asm sideeffect "; clobber v41", "~{v41}"()
call void asm sideeffect "",
"~{s39},~{s40},~{s41},~{s42},~{s43},~{s44},~{s45},~{s46},~{s47},~{s48},~{s49}
,~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s56},~{s57},~{s58},~{s59}
@ -347,20 +348,20 @@ define void @no_unused_non_csr_sgpr_for_fp() #1 {
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-NEXT: v_writelane_b32 v32, s33, 2
; GCN-NEXT: v_writelane_b32 v32, s30, 0
; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-DAG: v_writelane_b32 v32, s31, 1
; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
; GCN-DAG: buffer_store_dword
; GCN: s_add_u32 s32, s32, 0x300{{$}}
; GCN: ;;#ASMSTART
; GCN: v_readlane_b32 s4, v32, 0
; GCN-NEXT: v_readlane_b32 s5, v32, 1
; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0
; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x300{{$}}
; GCN-NEXT: v_readlane_b32 s33, v32, 2
; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
@ -377,11 +378,11 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
,~{s30},~{s31}"() #0
call void asm sideeffect "; clobber nonpreserved VGPRs",
call void asm sideeffect "; clobber nonpreserved initial VGPRs",
"~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
,~{v30},~{v31}"() #1
,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #1
ret void
}
@ -394,19 +395,19 @@ define void @no_unused_non_csr_sgpr_for_fp_no_scratch_vgpr() #1 {
; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008
; GCN-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-NEXT: v_writelane_b32 v32, s33, 2
; GCN-NEXT: v_writelane_b32 v32, s30, 0
; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s33, 2
; GCN-NEXT: v_writelane_b32 [[CSR_VGPR]], s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-DAG: v_writelane_b32 v32, s31, 1
; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1
; GCN-DAG: s_add_u32 s32, s32, 0x40300{{$}}
; GCN-DAG: buffer_store_dword
; GCN: ;;#ASMSTART
; GCN: v_readlane_b32 s4, v32, 0
; GCN-NEXT: v_readlane_b32 s5, v32, 1
; GCN: v_readlane_b32 s4, [[CSR_VGPR]], 0
; GCN-NEXT: v_readlane_b32 s5, [[CSR_VGPR]], 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x40300{{$}}
; GCN-NEXT: v_readlane_b32 s33, v32, 2
; GCN-NEXT: v_readlane_b32 s33, [[CSR_VGPR]], 2
; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: v_mov_b32_e32 [[SCRATCH_VGPR:v[0-9]+]], 0x1008
; GCN-NEXT: buffer_load_dword [[CSR_VGPR]], [[SCRATCH_VGPR]], s[0:3], s32 offen ; 4-byte Folded Reload
@ -429,7 +430,7 @@ define void @scratch_reg_needed_mubuf_offset([4096 x i8] addrspace(5)* byval ali
"~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
,~{v30},~{v31}"() #1
,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"() #1
ret void
}

View File

@ -144,7 +144,7 @@ define hidden void @use_workgroup_id_yz() #1 {
; GCN-NOT: s12
; GCN-NOT: s13
; GCN-NOT: s14
; GCN: v_readlane_b32 s4, v32, 0
; GCN: v_readlane_b32 s4, v40, 0
define hidden void @func_indirect_use_workgroup_id_x() #1 {
call void @use_workgroup_id_x()
ret void
@ -152,7 +152,7 @@ define hidden void @func_indirect_use_workgroup_id_x() #1 {
; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y:
; GCN-NOT: s4
; GCN: v_readlane_b32 s4, v32, 0
; GCN: v_readlane_b32 s4, v40, 0
define hidden void @func_indirect_use_workgroup_id_y() #1 {
call void @use_workgroup_id_y()
ret void
@ -160,7 +160,7 @@ define hidden void @func_indirect_use_workgroup_id_y() #1 {
; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z:
; GCN-NOT: s4
; GCN: v_readlane_b32 s4, v32, 0
; GCN: v_readlane_b32 s4, v40, 0
define hidden void @func_indirect_use_workgroup_id_z() #1 {
call void @use_workgroup_id_z()
ret void

View File

@ -302,7 +302,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_yz() #1 {
; Argument is in right place already
; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_x:
; GCN-NOT: s4
; GCN: v_readlane_b32 s4, v32, 0
; GCN: v_readlane_b32 s4, v40, 0
define hidden void @func_indirect_use_workgroup_id_x() #1 {
call void @use_workgroup_id_x()
ret void
@ -310,7 +310,7 @@ define hidden void @func_indirect_use_workgroup_id_x() #1 {
; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_y:
; GCN-NOT: s4
; GCN: v_readlane_b32 s4, v32, 0
; GCN: v_readlane_b32 s4, v40, 0
define hidden void @func_indirect_use_workgroup_id_y() #1 {
call void @use_workgroup_id_y()
ret void
@ -318,7 +318,7 @@ define hidden void @func_indirect_use_workgroup_id_y() #1 {
; GCN-LABEL: {{^}}func_indirect_use_workgroup_id_z:
; GCN-NOT: s4
; GCN: v_readlane_b32 s4, v32, 0
; GCN: v_readlane_b32 s4, v40, 0
define hidden void @func_indirect_use_workgroup_id_z() #1 {
call void @use_workgroup_id_z()
ret void

View File

@ -396,13 +396,11 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workitem_id_z() #1 {
}
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x:
; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}}
; VARABI: v_and_b32_e32 v32, 0x3ff, v32
; VARABI: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, v32
; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; VARABI-NEXT: s_waitcnt
; VARABI: s_waitcnt
; VARABI-NEXT: s_setpc_b64
; FIXEDABI: v_and_b32_e32 v31, 0x3ff, v31
@ -514,15 +512,15 @@ define void @func_call_too_many_args_use_workitem_id_x(i32 %arg0) #1 {
; Requires loading and storing to stack slot.
; GCN-LABEL: {{^}}too_many_args_call_too_many_args_use_workitem_id_x:
; GCN-DAG: s_add_u32 s32, s32, 0x400{{$}}
; GCN-DAG: buffer_store_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-DAG: buffer_load_dword v32, off, s[0:3], s33{{$}}
; GCN: buffer_store_dword v32, off, s[0:3], s32{{$}}
; GCN: s_swappc_b64
; GCN: buffer_load_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN: s_sub_u32 s32, s32, 0x400{{$}}
; GCN: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN: s_setpc_b64
define void @too_many_args_call_too_many_args_use_workitem_id_x(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
@ -543,13 +541,11 @@ define void @too_many_args_call_too_many_args_use_workitem_id_x(
; frame[2] = VGPR spill slot
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_x_byval:
; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:4
; VARABI-NEXT: s_waitcnt
; VARABI-NEXT: v_and_b32_e32 v32, 0x3ff, v32
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, v32
; VARABI: buffer_load_dword v0, off, s[0:3], s32{{$}}
; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; VARABI: s_setpc_b64
@ -700,10 +696,7 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
ret void
}
; Only one stack load should be emitted for all 3 values.
; GCN-LABEL: {{^}}too_many_args_use_workitem_id_xyz:
; VARABI: buffer_store_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; VARABI: buffer_store_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; VARABI-NOT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32{{$}}
; VARABI: buffer_load_dword v32, off, s[0:3], s32{{$}}
; VARABI-NOT: buffer_load_dword
@ -717,9 +710,7 @@ define void @func_call_too_many_args_use_workitem_id_x_byval() #1 {
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Y]]
; VARABI-NEXT: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+]}}, [[BFE_Z]]
; VARABI: buffer_load_dword v33, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; VARABI: buffer_load_dword v32, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; VARABI-NEXT: s_waitcnt
; VARABI: s_waitcnt
; VARABI-NEXT: s_setpc_b64
@ -826,7 +817,7 @@ define amdgpu_kernel void @kern_call_too_many_args_use_workitem_id_xyz() #1 {
; GCN: s_waitcnt
; GCN-NEXT: s_setpc_b64
; GCN: ScratchSize: 8
; GCN: ScratchSize: 0
define void @too_many_args_use_workitem_id_x_stack_yz(
i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7,
i32 %arg8, i32 %arg9, i32 %arg10, i32 %arg11, i32 %arg12, i32 %arg13, i32 %arg14, i32 %arg15,

View File

@ -28,23 +28,23 @@ define float @call_split_type_used_outside_block_v2f32() #0 {
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 v32, s33, 2
; GCN-NEXT: v_writelane_b32 v32, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func_v2f32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func_v2f32@rel32@hi+4
; GCN-NEXT: v_writelane_b32 v32, s31, 1
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s4, v32, 0
; GCN-NEXT: v_readlane_b32 s5, v32, 1
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v32, 2
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
@ -62,23 +62,23 @@ define float @call_split_type_used_outside_block_v3f32() #0 {
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 v32, s33, 2
; GCN-NEXT: v_writelane_b32 v32, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func_v3f32@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func_v3f32@rel32@hi+4
; GCN-NEXT: v_writelane_b32 v32, s31, 1
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s4, v32, 0
; GCN-NEXT: v_readlane_b32 s5, v32, 1
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v32, 2
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
@ -96,23 +96,23 @@ define half @call_split_type_used_outside_block_v4f16() #0 {
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 v32, s33, 2
; GCN-NEXT: v_writelane_b32 v32, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func_v4f16@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func_v4f16@rel32@hi+4
; GCN-NEXT: v_writelane_b32 v32, s31, 1
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s4, v32, 0
; GCN-NEXT: v_readlane_b32 s5, v32, 1
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v32, 2
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]
@ -130,24 +130,24 @@ define { i32, half } @call_split_type_used_outside_block_struct() #0 {
; GCN: ; %bb.0: ; %bb0
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: v_writelane_b32 v32, s33, 2
; GCN-NEXT: v_writelane_b32 v32, s30, 0
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN-NEXT: v_writelane_b32 v40, s30, 0
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_u32 s32, s32, 0x400
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, func_struct@rel32@lo+4
; GCN-NEXT: s_addc_u32 s5, s5, func_struct@rel32@hi+4
; GCN-NEXT: v_writelane_b32 v32, s31, 1
; GCN-NEXT: v_writelane_b32 v40, s31, 1
; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GCN-NEXT: v_readlane_b32 s4, v32, 0
; GCN-NEXT: v_readlane_b32 s5, v32, 1
; GCN-NEXT: v_readlane_b32 s4, v40, 0
; GCN-NEXT: v_readlane_b32 s5, v40, 1
; GCN-NEXT: v_mov_b32_e32 v1, v4
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v32, 2
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]

View File

@ -3,7 +3,7 @@
; CHECK-DAG: csr Clobbered Registers: $vgpr0 $vgpr0_hi16 $vgpr0_lo16 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1 $vgpr0_vgpr1_vgpr2 {{$}}
define void @csr() #0 {
call void asm sideeffect "", "~{v0},~{v36},~{v37}"() #0
call void asm sideeffect "", "~{v0},~{v44},~{v45}"() #0
ret void
}

View File

@ -187,44 +187,44 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1
; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-NEXT: v_writelane_b32 v35, s33, 4
; GFX9-NEXT: v_writelane_b32 v43, s33, 4
; GFX9-NEXT: s_mov_b32 s33, s32
; GFX9-NEXT: s_add_u32 s32, s32, 0x800
; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v35, s34, 0
; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9-NEXT: v_writelane_b32 v43, s34, 0
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+4
; GFX9-NEXT: v_writelane_b32 v35, s35, 1
; GFX9-NEXT: v_writelane_b32 v43, s35, 1
; GFX9-NEXT: s_load_dwordx2 s[34:35], s[4:5], 0x0
; GFX9-NEXT: v_mov_b32_e32 v32, v1
; GFX9-NEXT: v_mov_b32_e32 v33, v0
; GFX9-NEXT: v_writelane_b32 v35, s30, 2
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v33, v32
; GFX9-NEXT: v_writelane_b32 v35, s31, 3
; GFX9-NEXT: v_and_b32_e32 v34, 0xffffff, v32
; GFX9-NEXT: v_mov_b32_e32 v40, v1
; GFX9-NEXT: v_mov_b32_e32 v41, v0
; GFX9-NEXT: v_writelane_b32 v43, s30, 2
; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40
; GFX9-NEXT: v_writelane_b32 v43, s31, 3
; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_mad_u32_u24 v32, v33, v32, v34
; GFX9-NEXT: v_mov_b32_e32 v0, v32
; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42
; GFX9-NEXT: v_mov_b32_e32 v0, v40
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_add_u32_e32 v0, v32, v34
; GFX9-NEXT: v_add_u32_e32 v0, v40, v42
; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-NEXT: v_readlane_b32 s4, v35, 2
; GFX9-NEXT: v_readlane_b32 s5, v35, 3
; GFX9-NEXT: v_readlane_b32 s35, v35, 1
; GFX9-NEXT: v_readlane_b32 s34, v35, 0
; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: v_readlane_b32 s4, v43, 2
; GFX9-NEXT: v_readlane_b32 s5, v43, 3
; GFX9-NEXT: v_readlane_b32 s35, v43, 1
; GFX9-NEXT: v_readlane_b32 s34, v43, 0
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: s_sub_u32 s32, s32, 0x800
; GFX9-NEXT: v_readlane_b32 s33, v35, 4
; GFX9-NEXT: v_readlane_b32 s33, v43, 4
; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: s_mov_b64 exec, s[6:7]
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[4:5]

View File

@ -12,23 +12,23 @@ declare void @external_void_func_i32(i32) #0
; Spill CSR VGPR used for SGPR spilling
; GCN: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GCN-DAG: v_writelane_b32 v32, s33, 2
; GCN-DAG: v_writelane_b32 v40, s33, 2
; GCN-DAG: s_mov_b32 s33, s32
; GCN-DAG: s_add_u32 s32, s32, 0x400
; GCN-DAG: v_writelane_b32 v32, s30, 0
; GCN-DAG: v_writelane_b32 v32, s31, 1
; GCN-DAG: v_writelane_b32 v40, s30, 0
; GCN-DAG: v_writelane_b32 v40, s31, 1
; GCN: s_swappc_b64
; GCN: v_readlane_b32 s4, v32, 0
; GCN: v_readlane_b32 s5, v32, 1
; GCN: v_readlane_b32 s4, v40, 0
; GCN: v_readlane_b32 s5, v40, 1
; GCN-NEXT: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33, v32, 2
; GCN-NEXT: v_readlane_b32 s33, v40, 2
; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[4:5]

View File

@ -254,7 +254,7 @@ body: |
...
# GCN-LABEL: csr{{$}}
# GCN: V_AND_B32_e32 $vgpr4, $vgpr0,
# GCN: V_AND_B32_e32 $vgpr37, $vgpr0,
---
name: csr
tracksRegLiveness: true

View File

@ -152,9 +152,6 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l
; FIXME: Why load and store same location for stack args?
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32:
; GCN-DAG: buffer_store_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-DAG: buffer_load_dword [[LOAD_0:v[0-9]+]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_load_dword [[LOAD_1:v[0-9]+]], off, s[0:3], s32 offset:4
@ -163,9 +160,6 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l
; GCN-DAG: buffer_store_dword [[LOAD_0]], off, s[0:3], s32{{$}}
; GCN-DAG: buffer_store_dword [[LOAD_1]], off, s[0:3], s32 offset:4
; GCN-DAG: buffer_load_dword v32, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
; GCN-DAG: buffer_load_dword v33, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NOT: s32
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
@ -176,7 +170,7 @@ entry:
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
; GCN-DAG: v_mov_b32_e32 [[NINE:v[0-9]+]], 9
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:40
; GCN: buffer_store_dword [[NINE]], off, s[0:3], s32 offset:28
; GCN: s_setpc_b64
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:
@ -203,15 +197,15 @@ entry:
; Have another non-tail in the function
; GCN-LABEL: {{^}}sibling_call_i32_fastcc_i32_i32_other_call:
; GCN: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
; GCN-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec
; GCN: s_mov_b32 s33, s32
; GCN-DAG: s_add_u32 s32, s32, 0x400
; GCN-DAG: buffer_store_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v33, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 v34, s34, 0
; GCN-DAG: v_writelane_b32 v34, s35, 1
; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill
; GCN-DAG: v_writelane_b32 v42, s34, 0
; GCN-DAG: v_writelane_b32 v42, s35, 1
; GCN-DAG: s_getpc_b64 s[4:5]
; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4
@ -220,11 +214,11 @@ entry:
; GCN: s_swappc_b64
; GCN-DAG: v_readlane_b32 s34, v34, 0
; GCN-DAG: v_readlane_b32 s35, v34, 1
; GCN-DAG: v_readlane_b32 s34, v42, 0
; GCN-DAG: v_readlane_b32 s35, v42, 1
; GCN: buffer_load_dword v33, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN: buffer_load_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
; GCN: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GCN: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
@ -233,7 +227,7 @@ entry:
; GCN: s_sub_u32 s32, s32, 0x400
; GCN-NEXT: v_readlane_b32 s33,
; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
; GCN-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[6:7]
; GCN-NEXT: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 {
@ -248,11 +242,11 @@ entry:
; GCN-LABEL: {{^}}sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
; GCN-NOT: s33
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
; GCN-NOT: s33
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 offset:
; GCN: s_setpc_b64 s[4:5]
define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c) #1 {
entry:

View File

@ -2,17 +2,17 @@
; GCN-LABEL: {{^}}spill_csr_s5_copy:
; GCN: s_or_saveexec_b64
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; GCN-NEXT: s_mov_b64 exec
; GCN: v_writelane_b32 v32, s33, 2
; GCN: v_writelane_b32 v40, s33, 2
; GCN: s_swappc_b64
; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9
; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}}
; GCN: v_readlane_b32 s33, v32, 2
; GCN: v_readlane_b32 s33, v40, 2
; GCN: s_or_saveexec_b64
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; GCN: s_mov_b64 exec
; GCN: s_setpc_b64
define void @spill_csr_s5_copy() #0 {

View File

@ -29,7 +29,7 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
; GCN-NEXT: s_mov_b64 s[0:1], s[36:37]
; GCN-NEXT: s_mov_b64 s[2:3], s[38:39]
; GCN-NEXT: s_mov_b32 s32, 0xc0000
; GCN-NEXT: v_add_nc_u32_e64 v32, 4, 0x4000
; GCN-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000
; GCN-NEXT: ; implicit-def: $vcc_hi
; GCN-NEXT: s_getpc_b64 s[4:5]
; GCN-NEXT: s_add_u32 s4, s4, svm_eval_nodes@rel32@lo+4
@ -41,8 +41,8 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
; GCN-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GCN-NEXT: s_cbranch_execz BB0_2
; GCN-NEXT: ; %bb.1: ; %if.then4.i
; GCN-NEXT: buffer_load_dword v0, v32, s[36:39], s32 offen
; GCN-NEXT: buffer_load_dword v1, v32, s[36:39], s32 offen offset:4
; GCN-NEXT: buffer_load_dword v0, v40, s[36:39], s32 offen
; GCN-NEXT: buffer_load_dword v1, v40, s[36:39], s32 offen offset:4
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_nc_u32_e32 v0, v1, v0
; GCN-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0

View File

@ -0,0 +1,170 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s
declare void @extern_func()
define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
; The VGPR tuple8 operand of the image_gather4_c_b_cl instruction does not
; need to be preserved across the call (nothing reads it afterwards), so the
; register allocator should place it in 8 scratch (caller-saved) registers.
; GFX9-LABEL: non_preserved_vgpr_tuple8:
; GFX9: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9: v_mov_b32_e32 v37, v11
; GFX9-NEXT: v_mov_b32_e32 v38, v10
; GFX9-NEXT: v_mov_b32_e32 v49, v9
; GFX9-NEXT: v_writelane_b32 v44, s30, 0
; GFX9-NEXT: v_mov_b32_e32 v36, v16
; GFX9-NEXT: v_mov_b32_e32 v35, v15
; GFX9-NEXT: v_mov_b32_e32 v34, v14
; GFX9-NEXT: v_mov_b32_e32 v33, v13
; GFX9-NEXT: v_mov_b32_e32 v32, v12
; GFX9: ;;#ASMSTART
; GFX9-NEXT: ;;#ASMEND
; GFX9: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1
; GFX9-NEXT: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX9: s_setpc_b64 s[4:5]
;
; GFX10-LABEL: non_preserved_vgpr_tuple8:
; GFX10: buffer_store_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10: v_mov_b32_e32 v36, v16
; GFX10-NEXT: v_mov_b32_e32 v35, v15
; GFX10-NEXT: v_mov_b32_e32 v34, v14
; GFX10-NEXT: v_mov_b32_e32 v33, v13
; GFX10-NEXT: v_mov_b32_e32 v32, v12
; GFX10: ;;#ASMSTART
; GFX10-NEXT: ;;#ASMEND
; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1
; GFX10-NEXT: v_nop
; GFX10-NEXT: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX10: buffer_load_dword v44, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
; GFX10: s_setpc_b64 s[4:5]
main_body:
; Explicitly clobber v0-v31 (the argument registers) so the gather4 tuple
; cannot be allocated there and must take the next available registers.
call void asm sideeffect "", "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7}"() #0
call void asm sideeffect "", "~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15}"() #0
call void asm sideeffect "", "~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23}"() #0
call void asm sideeffect "", "~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() #0
; %v is computed before the call and only returned afterwards; no gather4
; *operand* is read again after extern_func returns.
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
call void @extern_func()
ret <4 x float> %v
}
define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, float %bias, float %zcompare, float %s, float %t, float %clamp) {
; The VGPR tuple8 operand of the image_gather4_c_b_cl instruction must be
; preserved across the call, so it should be allocated to 8 CSRs
; (callee-saved registers).
; Only the lower 5 sub-registers of the tuple are preserved.
; The upper 3 sub-registers are unused.
; GFX9-LABEL: call_preserved_vgpr_tuple8:
; GFX9: buffer_store_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX9: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX9: v_mov_b32_e32 v44, v16
; GFX9-NEXT: v_mov_b32_e32 v43, v15
; GFX9-NEXT: v_mov_b32_e32 v42, v14
; GFX9-NEXT: v_mov_b32_e32 v41, v13
; GFX9-NEXT: v_mov_b32_e32 v40, v12
; GFX9: s_getpc_b64 s[4:5]
; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4
; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:47], s[36:43], s[44:47] dmask:0x1
; GFX9: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX9: buffer_load_dword v56, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX9: s_setpc_b64 s[4:5]
;
; GFX10-LABEL: call_preserved_vgpr_tuple8:
; GFX10: buffer_store_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
; GFX10: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
; GFX10: s_getpc_b64 s[4:5]
; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4
; GFX10-NEXT: v_mov_b32_e32 v40, v16
; GFX10-NEXT: v_mov_b32_e32 v41, v15
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:19], s[36:43], s[44:47] dmask:0x1
; GFX10-NEXT: v_mov_b32_e32 v42, v14
; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; GFX10-NEXT: v_mov_b32_e32 v43, v13
; GFX10-NEXT: v_mov_b32_e32 v44, v12
; GFX10-NEXT: ; implicit-def: $vcc_hi
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5]
; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[36:43], s[44:47] dmask:0x1
; GFX10: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
; GFX10: buffer_load_dword v45, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
; GFX10: s_setpc_b64 s[4:5]
main_body:
; The second gather4 after extern_func reuses the same SSA operands
; (%bias, %zcompare, %s, %t, %clamp), so the register tuple holding them
; is live across the call and must survive it.
%v = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
store <4 x float> %v, <4 x float> addrspace(1)* undef
call void @extern_func()
%v1 = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 1, float %bias, float %zcompare, float %s, float %t, float %clamp, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 0, i32 0)
ret <4 x float> %v1
}
declare <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.2d.v4f32.f32.f32(i32 immarg, float, float, float, float, float, <8 x i32>, <4 x i32>, i1 immarg, i32 immarg, i32 immarg) #1
attributes #0 = { nounwind writeonly }
attributes #1 = { nounwind readonly }

View File

@ -29,7 +29,7 @@ machineFunctionInfo:
body: |
bb.0:
; CHECK-LABEL: name: undef_identity_copy
; CHECK: renamable $vgpr32_vgpr33_vgpr34_vgpr35 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1)
; CHECK: renamable $vgpr40_vgpr41_vgpr42_vgpr43 = FLAT_LOAD_DWORDX4 undef renamable $vgpr0_vgpr1, 0, 0, 0, 0, implicit $exec, implicit $flat_scr :: (load 16, addrspace 1)
; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @foo + 4, target-flags(amdgpu-rel32-hi) @foo + 4, implicit-def dead $scc
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95
; CHECK: $sgpr4 = COPY $sgpr95
@ -38,9 +38,9 @@ body: |
; CHECK: renamable $sgpr6_sgpr7 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-rel32-lo) @bar + 4, target-flags(amdgpu-rel32-hi) @bar + 4, implicit-def dead $scc
; CHECK: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95
; CHECK: $sgpr4 = COPY $sgpr95
; CHECK: $vgpr0 = COPY renamable $vgpr32
; CHECK: $vgpr1 = COPY renamable $vgpr33
; CHECK: $vgpr2 = COPY renamable $vgpr34
; CHECK: $vgpr0 = COPY renamable $vgpr40
; CHECK: $vgpr1 = COPY renamable $vgpr41
; CHECK: $vgpr2 = COPY renamable $vgpr42
; CHECK: $vgpr3 = KILL undef renamable $vgpr3
; CHECK: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr6_sgpr7, @bar, csr_amdgpu_highregs, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4, implicit $vgpr0, implicit killed $vgpr1, implicit killed $vgpr2, implicit killed $vgpr3, implicit-def $vgpr0
; CHECK: ADJCALLSTACKDOWN 0, 4, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32, implicit $sgpr95

View File

@ -1058,30 +1058,30 @@ declare void @external_void_func_void() #1
; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}}
; GCN-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
; GCN-NEXT: v_nop
; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]]
; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]]
; GCN-NEXT: v_writelane_b32 v32, s33, 2
; GCN-NEXT: v_writelane_b32 v40, s33, 2
; GCN: s_mov_b32 s33, s32
; GFX1064: s_add_u32 s32, s32, 0x400
; GFX1032: s_add_u32 s32, s32, 0x200
; GCN-DAG: v_writelane_b32 v32, s30, 0
; GCN-DAG: v_writelane_b32 v32, s31, 1
; GCN-DAG: v_writelane_b32 v40, s30, 0
; GCN-DAG: v_writelane_b32 v40, s31, 1
; GCN: s_swappc_b64
; GCN-DAG: v_readlane_b32 s4, v32, 0
; GCN-DAG: v_readlane_b32 s5, v32, 1
; GCN-DAG: v_readlane_b32 s4, v40, 0
; GCN-DAG: v_readlane_b32 s5, v40, 1
; GFX1064: s_sub_u32 s32, s32, 0x400
; GFX1032: s_sub_u32 s32, s32, 0x200
; GCN: v_readlane_b32 s33, v32, 2
; GCN: v_readlane_b32 s33, v40, 2
; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}}
; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}}
; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
; GCN-NEXT: v_nop
; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]]
; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]]