1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00
llvm-mirror/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
Tom Stellard fe894cb93a AMDGPU/SILoadStoreOptimizer: Improve merging of out of order offsets
Summary:
This improves merging of sequences like:

store a, ptr + 4
store b, ptr + 8
store c, ptr + 12
store d, ptr + 16
store e, ptr + 20
store f, ptr

Prior to this patch the basic block was scanned in order to find instructions
to merge and the above sequence would be transformed to:

store4 <a, b, c, d>, ptr + 4
store e, ptr + 20
store f, ptr

With this change, we now sort all the candidate merge instructions by their offset,
so instructions are visited in offset order rather than in the order they appear
in the basic block.  We now transform this sequence into:

store4 <f, a, b, c>, ptr
store2 <d, e>, ptr + 16

Another benefit of this change is that since we have sorted the mergeable lists
by offset, we can easily check if an instruction is mergeable by checking the
offset of the instruction that comes before or after it in the sorted list.
Once we determine an instruction is not mergeable we can remove it from the list
and avoid having to do the more expensive mergeability checks.

Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin

Reviewed By: arsenm, nhaehnle

Subscribers: kerbowa, merge_guards_bot, kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D65966
2020-01-24 19:45:56 -08:00

44 lines
2.0 KiB
LLVM

; Test driver: the module is compiled with llc for two amdgcn CPUs (bonaire and
; tonga); both outputs are verified against the same check prefix, with
; whitespace-sensitive matching enabled.
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI %s
; 512-element float array in address space 3; every load in the kernel below
; reads one element of it.
@lds = addrspace(3) global [512 x float] undef, align 4
; offset0 is larger than offset1
; NOTE(review): the remark above appears to describe the order of the loads in
; the IR (for example, element 3 is loaded before element 2), not the printed
; operands, since the expected instructions below list ascending offset pairs
; (2/3 and 11/12) -- confirm.
; Expected output: three paired reads (one printing only offset1:14, one with
; offsets 2 and 3, one with offsets 11 and 12) plus one single read at byte
; offset 1024, which corresponds to element 256 of the float array (256 * 4).
; The DAG-style checks may match in any order after the function label.
; SI-LABEL: {{^}}offset_order:
; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset1:14{{$}}
; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1024
; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12
; Kernel under test: loads seven elements of @lds in the index order
; 0, 256, 3, 2, 12, 14, 11, sums them with a chain of fadd instructions, and
; stores the total to %out. The indices are deliberately not in ascending
; order; do not reorder the loads, because the expected instruction pairing in
; the checks for this function depends on this exact sequence.
define amdgpu_kernel void @offset_order(float addrspace(1)* %out) {
entry:
; Element 0.
%ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0
%val0 = load float, float addrspace(3)* %ptr0
; Element 256 (byte offset 1024), far from every other element read here.
%ptr1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 256
%val1 = load float, float addrspace(3)* %ptr1
%add1 = fadd float %val0, %val1
; Elements 3 and then 2: adjacent, but read in descending order.
%ptr2 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 3
%val2 = load float, float addrspace(3)* %ptr2
%add2 = fadd float %add1, %val2
%ptr3 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2
%val3 = load float, float addrspace(3)* %ptr3
%add3 = fadd float %add2, %val3
; Elements 12, 14 and then 11: again not in ascending order.
%ptr4 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 12
%val4 = load float, float addrspace(3)* %ptr4
%add4 = fadd float %add3, %val4
%ptr5 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 14
%val5 = load float, float addrspace(3)* %ptr5
%add5 = fadd float %add4, %val5
%ptr6 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11
%val6 = load float, float addrspace(3)* %ptr6
%add6 = fadd float %add5, %val6
; Write the accumulated sum to the address-space-1 output pointer so that all
; seven loads are used.
store float %add6, float addrspace(1)* %out
ret void
}