mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
138ecf877e
We are relying on atrificial DAG edges inserted by the MemOpClusterMutation to keep loads and stores together in the post-RA scheduler. This does not work all the time since it allows to schedule a completely independent instruction in the middle of the cluster. Removed the DAG mutation and added pass to bundle already clustered instructions. These bundles are unpacked before the memory legalizer because it does not work with bundles but also because it allows to insert waitcounts in the middle of a store cluster. Removing artificial edges also allows a more relaxed scheduling. Differential Revision: https://reviews.llvm.org/D72737
81 lines
3.5 KiB
LLVM
81 lines
3.5 KiB
LLVM
; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
|
|
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
|
|
|
|
@local_memory.local_mem = internal unnamed_addr addrspace(3) global [128 x i32] undef, align 4
|
|
|
|
; GCN-LABEL: {{^}}local_memory:
|
|
|
|
; GCN-NOT: s_wqm_b64
|
|
; GCN: ds_write_b32
|
|
|
|
; GCN: s_barrier
|
|
|
|
; GCN: ds_read_b32 {{v[0-9]+}},
|
|
define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 {
|
|
entry:
|
|
%y.i = call i32 @llvm.amdgcn.workitem.id.x() #1
|
|
%arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
|
|
store i32 %y.i, i32 addrspace(3)* %arrayidx, align 4
|
|
%add = add nsw i32 %y.i, 1
|
|
%cmp = icmp eq i32 %add, 16
|
|
%.add = select i1 %cmp, i32 0, i32 %add
|
|
call void @llvm.amdgcn.s.barrier()
|
|
%arrayidx1 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %.add
|
|
%tmp = load i32, i32 addrspace(3)* %arrayidx1, align 4
|
|
%arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %y.i
|
|
store i32 %tmp, i32 addrspace(1)* %arrayidx2, align 4
|
|
ret void
|
|
}
|
|
|
|
@local_memory_two_objects.local_mem0 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
|
|
@local_memory_two_objects.local_mem1 = internal unnamed_addr addrspace(3) global [4 x i32] undef, align 4
|
|
|
|
; Check that the LDS size emitted correctly
|
|
; EG: .long 166120
|
|
; EG-NEXT: .long 8
|
|
; GCN: .long 47180
|
|
; GCN-NEXT: .long 32900
|
|
|
|
; GCN-LABEL: {{^}}local_memory_two_objects:
|
|
; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
|
|
; CI-DAG: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
|
|
; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
|
|
; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
|
|
; SI-DAG: v_sub_i32_e32 [[SUB0:v[0-9]+]], vcc, 28, [[ADDRW]]
|
|
|
|
; GCN: s_barrier
|
|
|
|
; SI: v_sub_i32_e32 [[SUB1:v[0-9]+]], vcc, 12, [[ADDRW]]
|
|
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB0]]
|
|
; SI-DAG: ds_read_b32 v{{[0-9]+}}, [[SUB1]]
|
|
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
|
|
|
|
define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
|
|
entry:
|
|
%x.i = call i32 @llvm.amdgcn.workitem.id.x()
|
|
%arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
|
|
store i32 %x.i, i32 addrspace(3)* %arrayidx, align 4
|
|
%mul = shl nsw i32 %x.i, 1
|
|
%arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %x.i
|
|
store i32 %mul, i32 addrspace(3)* %arrayidx1, align 4
|
|
%sub = sub nsw i32 3, %x.i
|
|
call void @llvm.amdgcn.s.barrier()
|
|
%arrayidx2 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %sub
|
|
%tmp = load i32, i32 addrspace(3)* %arrayidx2, align 4
|
|
%arrayidx3 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %x.i
|
|
store i32 %tmp, i32 addrspace(1)* %arrayidx3, align 4
|
|
%arrayidx4 = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem1, i32 0, i32 %sub
|
|
%tmp1 = load i32, i32 addrspace(3)* %arrayidx4, align 4
|
|
%add = add nsw i32 %x.i, 4
|
|
%arrayidx5 = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %add
|
|
store i32 %tmp1, i32 addrspace(1)* %arrayidx5, align 4
|
|
ret void
|
|
}
|
|
|
|
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
|
declare void @llvm.amdgcn.s.barrier() #2
|
|
|
|
attributes #0 = { nounwind }
|
|
attributes #1 = { nounwind readnone }
|
|
attributes #2 = { convergent nounwind }
|