mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
138ecf877e
We are relying on artificial DAG edges inserted by the MemOpClusterMutation to keep loads and stores together in the post-RA scheduler. This does not work all the time, since it allows a completely independent instruction to be scheduled in the middle of the cluster. Removed the DAG mutation and added a pass to bundle already clustered instructions. These bundles are unpacked before the memory legalizer because it does not work with bundles, but also because this allows waitcounts to be inserted in the middle of a store cluster. Removing the artificial edges also allows more relaxed scheduling. Differential Revision: https://reviews.llvm.org/D72737
549 lines
27 KiB
LLVM
549 lines
27 KiB
LLVM
; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX8 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s

; External function called by every kernel in this file with argument 0; the
; i64 result seeds all address computations. Presumably the OpenCL
; get_global_id builtin under its Itanium-mangled name (not verifiable from
; this file alone).
declare i64 @_Z13get_global_idj(i32)
|
|
|
|
; Sums eight i64 loads that share one base address (%addr1) and differ only by
; the constant element offsets 0, 256, ..., 1792 (2048 bytes apart), then
; stores the total to %saddr.
; The expectations below require eight loads per target: the gfx803 run
; expects flat loads with no immediate offset, while the gfx900 and gfx1010
; runs expect global loads, some of which carry part of the constant in the
; instruction's immediate offset operand.
define amdgpu_kernel void @clmem_read_simplified(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read_simplified:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}

entry:
  ; %saddr = %buffer + ((id << 7) & 0xFFFF8000); %addr1 adds (id & 255) i64
  ; elements on top of that.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add.1 = add i64 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %load3, %add.1
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %load5, %add.3
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  store i64 %add.7, i64 addrspace(1)* %saddr, align 8
  ret void
}
|
|
|
|
; Looped variant of the same access pattern: a doubly nested loop whose inner
; body performs eleven i64 loads from %add.ptr6 at element indices
; (%block.029 | k*256) for k = 0..10 and accumulates them into a running sum.
; The inner loop advances %block.029 by 8192 while it stays below 4194304
; (512 trips); the outer loop counts %dec31 down from 127 and exits once it
; has seen 0 (128 trips). The final sum is stored to %a1.
; The expectations below require eleven loads per target, with the gfx900 and
; gfx1010 runs carrying constants in the immediate offset operand.
define hidden amdgpu_kernel void @clmem_read(i8 addrspace(1)* %buffer) {
; GCN-LABEL: clmem_read:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  ; %a1 = %buffer + ((id << 17) & 0xFE000000); %add.ptr6 adds (id & 255) i64
  ; elements on top of that.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 17
  %idx.ext11 = and i64 %a0, 4261412864
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %a1 = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*
  %add.ptr6 = getelementptr inbounds i64, i64 addrspace(1)* %a1, i64 %conv
  br label %for.cond.preheader

while.cond.loopexit: ; preds = %for.body
  ; The exit test uses the value before the decrement, so the iteration with
  ; %dec31 == 0 still runs.
  %dec = add nsw i32 %dec31, -1
  %tobool = icmp eq i32 %dec31, 0
  br i1 %tobool, label %while.end, label %for.cond.preheader

for.cond.preheader: ; preds = %entry, %while.cond.loopexit
  %dec31 = phi i32 [ 127, %entry ], [ %dec, %while.cond.loopexit ]
  %sum.030 = phi i64 [ 0, %entry ], [ %add.10, %while.cond.loopexit ]
  br label %for.body

for.body: ; preds = %for.body, %for.cond.preheader
  %block.029 = phi i32 [ 0, %for.cond.preheader ], [ %add9.31, %for.body ]
  %sum.128 = phi i64 [ %sum.030, %for.cond.preheader ], [ %add.10, %for.body ]
  %conv3 = zext i32 %block.029 to i64
  %add.ptr8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3
  %load1 = load i64, i64 addrspace(1)* %add.ptr8, align 8
  %add = add i64 %load1, %sum.128

  %add9 = or i32 %block.029, 256
  %conv3.1 = zext i32 %add9 to i64
  %add.ptr8.1 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.1
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.1, align 8
  %add.1 = add i64 %load2, %add

  %add9.1 = or i32 %block.029, 512
  %conv3.2 = zext i32 %add9.1 to i64
  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.2
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add.2 = add i64 %load3, %add.1

  %add9.2 = or i32 %block.029, 768
  %conv3.3 = zext i32 %add9.2 to i64
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.3
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add.3 = add i64 %load4, %add.2

  %add9.3 = or i32 %block.029, 1024
  %conv3.4 = zext i32 %add9.3 to i64
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.4
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add.4 = add i64 %load5, %add.3

  %add9.4 = or i32 %block.029, 1280
  %conv3.5 = zext i32 %add9.4 to i64
  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.5
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add.5 = add i64 %load6, %add.4

  %add9.5 = or i32 %block.029, 1536
  %conv3.6 = zext i32 %add9.5 to i64
  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.6
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add.6 = add i64 %load7, %add.5

  %add9.6 = or i32 %block.029, 1792
  %conv3.7 = zext i32 %add9.6 to i64
  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.7
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add.7 = add i64 %load8, %add.6

  %add9.7 = or i32 %block.029, 2048
  %conv3.8 = zext i32 %add9.7 to i64
  %add.ptr8.8 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.8
  %load9 = load i64, i64 addrspace(1)* %add.ptr8.8, align 8
  %add.8 = add i64 %load9, %add.7

  %add9.8 = or i32 %block.029, 2304
  %conv3.9 = zext i32 %add9.8 to i64
  %add.ptr8.9 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.9
  %load10 = load i64, i64 addrspace(1)* %add.ptr8.9, align 8
  %add.9 = add i64 %load10, %add.8

  %add9.9 = or i32 %block.029, 2560
  %conv3.10 = zext i32 %add9.9 to i64
  %add.ptr8.10 = getelementptr inbounds i64, i64 addrspace(1)* %add.ptr6, i64 %conv3.10
  %load11 = load i64, i64 addrspace(1)* %add.ptr8.10, align 8
  %add.10 = add i64 %load11, %add.9

  %add9.31 = add nuw nsw i32 %block.029, 8192
  %cmp.31 = icmp ult i32 %add9.31, 4194304
  br i1 %cmp.31, label %for.body, label %while.cond.loopexit

while.end: ; preds = %while.cond.loopexit
  store i64 %add.10, i64 addrspace(1)* %a1, align 8
  ret void
}
|
|
|
|
; using 32bit address.
; Sums ten i32 loads that share the base %add.ptr6 and differ only by the
; constant element offsets 0, 256, ..., 2304 (1024 bytes apart), then stores
; the total to %addr.
; The expectations below require ten dword loads per target, with the gfx900
; and gfx1010 runs carrying constants in the immediate offset operand.
define amdgpu_kernel void @Address32(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Address32:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
entry:
  ; %addr = %buffer + ((id << 7) & 0xFFFF8000); %add.ptr6 adds (id & 255) i32
  ; elements on top of that.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %id = shl i64 %call, 7
  %idx.ext11 = and i64 %id, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %addr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %add.ptr6 = getelementptr inbounds i32, i32 addrspace(1)* %addr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %add.ptr6, align 4

  %add.ptr8.1 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 256
  %load2 = load i32, i32 addrspace(1)* %add.ptr8.1, align 4
  %add.1 = add i32 %load2, %load1

  %add.ptr8.2 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 512
  %load3 = load i32, i32 addrspace(1)* %add.ptr8.2, align 4
  %add.2 = add i32 %load3, %add.1

  %add.ptr8.3 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 768
  %load4 = load i32, i32 addrspace(1)* %add.ptr8.3, align 4
  %add.3 = add i32 %load4, %add.2

  %add.ptr8.4 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1024
  %load5 = load i32, i32 addrspace(1)* %add.ptr8.4, align 4
  %add.4 = add i32 %load5, %add.3

  %add.ptr8.5 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1280
  %load6 = load i32, i32 addrspace(1)* %add.ptr8.5, align 4
  %add.5 = add i32 %load6, %add.4

  %add.ptr8.6 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1536
  %load7 = load i32, i32 addrspace(1)* %add.ptr8.6, align 4
  %add.6 = add i32 %load7, %add.5

  %add.ptr8.7 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 1792
  %load8 = load i32, i32 addrspace(1)* %add.ptr8.7, align 4
  %add.7 = add i32 %load8, %add.6

  %add.ptr8.8 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2048
  %load9 = load i32, i32 addrspace(1)* %add.ptr8.8, align 4
  %add.8 = add i32 %load9, %add.7

  %add.ptr8.9 = getelementptr inbounds i32, i32 addrspace(1)* %add.ptr6, i64 2304
  %load10 = load i32, i32 addrspace(1)* %add.ptr8.9, align 4
  %add.9 = add i32 %load10, %add.8

  store i32 %add.9, i32 addrspace(1)* %addr, align 4
  ret void
}
|
|
|
|
; Sums four i64 loads from %addr1 at element offsets 0, 536870400, 536870656
; and 536870912, i.e. byte offsets 0, 2^32 - 4096, 2^32 - 2048 and 2^32, so
; the last three constants do not fit in 32 bits. The total is stored to
; %saddr.
; The expectations below require four loads per target, with the gfx900 and
; gfx1010 runs carrying the small differences between those large constants
; in the immediate offset operand.
define amdgpu_kernel void @Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: Offset64:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  ; %saddr = %buffer + ((id << 7) & 0xFFFF8000); %addr1 adds (id & 255) i64
  ; elements on top of that.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870400
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8

  %add1 = add i64 %load2, %load1

  %addr3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870656
  %load3 = load i64, i64 addrspace(1)* %addr3, align 8

  %add2 = add i64 %load3, %add1

  %addr4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 536870912
  %load4 = load i64, i64 addrspace(1)* %addr4, align 8
  %add4 = add i64 %load4, %add2

  store i64 %add4, i64 addrspace(1)* %saddr, align 8
  ret void
}
|
|
|
|
; TODO: Support load4 as anchor instruction.
; Sums four i32 loads from %addr1 at element offsets 0, 536870400, 536870656
; and 536870912, i.e. byte offsets 0, 2^31 - 2048, 2^31 - 1024 and 2^31, then
; stores the total to %saddr.
; %addr1 is %saddr plus a multiple of 4 bytes (%conv i32 elements), so the
; accesses are annotated with 4-byte alignment, matching @Address32.
define amdgpu_kernel void @p32Offset64(i8 addrspace(1)* %buffer) {
; GCN-LABEL: p32Offset64:
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:3072
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
;
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dword {{v[0-9]+}}, v[{{[0-9]+:[0-9]+}}], off offset:1024
entry:
  ; %saddr = %buffer + ((id << 7) & 0xFFFF8000); %addr1 adds (id & 255) i32
  ; elements on top of that.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i32 addrspace(1)*

  %addr1 = getelementptr inbounds i32, i32 addrspace(1)* %saddr, i64 %conv
  %load1 = load i32, i32 addrspace(1)* %addr1, align 4

  %addr2 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870400
  %load2 = load i32, i32 addrspace(1)* %addr2, align 4

  %add1 = add i32 %load2, %load1

  %addr3 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870656
  %load3 = load i32, i32 addrspace(1)* %addr3, align 4

  %add2 = add i32 %load3, %add1

  %addr4 = getelementptr inbounds i32, i32 addrspace(1)* %addr1, i64 536870912
  %load4 = load i32, i32 addrspace(1)* %addr4, align 4
  %add4 = add i32 %load4, %add2

  store i32 %add4, i32 addrspace(1)* %saddr, align 4
  ret void
}
|
|
|
|
; Two independent base pointers: three i64 loads from %saddr (derived from
; %buffer1) at element offsets 512, 768 and 1024, and three from %saddr2
; (derived from %buffer2) at element offsets 1280, 1536 and 1792. Each group
; is summed separately, the two sums are added, and the result is stored to
; %saddr.
; The expectations below require six loads per target, with the gfx900 and
; gfx1010 runs carrying constants in the immediate offset operand.
define amdgpu_kernel void @DiffBase(i8 addrspace(1)* %buffer1,
                                    i8 addrspace(1)* %buffer2) {
; GCN-LABEL: DiffBase:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-4096
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  ; Both bases use the same byte index, (id << 7) & 0xFFFF8000, applied to
  ; different kernel arguments.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer1, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %add.ptr2 = getelementptr inbounds i8, i8 addrspace(1)* %buffer2, i64 %idx.ext11
  %saddr2 = bitcast i8 addrspace(1)* %add.ptr2 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 512
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8
  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 768
  %load2 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add1 = add i64 %load2, %load1
  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 1024
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add2 = add i64 %load3, %add1

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1280
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1536
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add3 = add i64 %load5, %load4

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %saddr2, i64 1792
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add4 = add i64 %load6, %add3

  %add5 = add i64 %add2, %add4

  store i64 %add5, i64 addrspace(1)* %saddr, align 8
  ret void
}
|
|
|
|
; Same eight-load sum as @clmem_read_simplified, but after the first load at
; offset 0 the remaining loads appear in descending offset order in the IR:
; element offsets 1792, 1536, 1280, 1024, 768, 512, 256 from %addr1. The total
; is stored to %saddr.
; The expectations below require eight loads per target; the gfx1010 run
; expects none of them to carry an immediate offset.
define amdgpu_kernel void @ReverseOrder(i8 addrspace(1)* %buffer) {
; GCN-LABEL: ReverseOrder:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off{{$}}
entry:
  ; %saddr = %buffer + ((id << 7) & 0xFFFF8000); %addr1 adds (id & 255) i64
  ; elements on top of that.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %saddr = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %saddr, i64 %conv
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %add.ptr8.7 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1792
  %load8 = load i64, i64 addrspace(1)* %add.ptr8.7, align 8
  %add7 = add i64 %load8, %load1

  %add.ptr8.6 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1536
  %load7 = load i64, i64 addrspace(1)* %add.ptr8.6, align 8
  %add6 = add i64 %load7, %add7

  %add.ptr8.5 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1280
  %load6 = load i64, i64 addrspace(1)* %add.ptr8.5, align 8
  %add5 = add i64 %load6, %add6

  %add.ptr8.4 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 1024
  %load5 = load i64, i64 addrspace(1)* %add.ptr8.4, align 8
  %add4 = add i64 %load5, %add5

  %add.ptr8.3 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 768
  %load4 = load i64, i64 addrspace(1)* %add.ptr8.3, align 8
  %add3 = add i64 %load4, %add4

  %add.ptr8.2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 512
  %load3 = load i64, i64 addrspace(1)* %add.ptr8.2, align 8
  %add2 = add i64 %load3, %add3

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %addr1, i64 256
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8
  %add1 = add i64 %load2, %add2

  store i64 %add1, i64 addrspace(1)* %saddr, align 8
  ret void
}
|
|
|
|
; Sums two i64 loads at large negative constant element offsets from
; %buffer_wave: -536870656 and -536870912, i.e. byte offsets -(2^32 - 2048)
; and -2^32. The total is stored to %buffer_head.
; The expectations below require two loads per target; on the gfx900 and
; gfx1010 runs one of them carries an immediate offset (2048 and -2048
; respectively) and the other carries none.
define hidden amdgpu_kernel void @negativeoffset(i8 addrspace(1)* nocapture %buffer) {
; GCN-LABEL: negativeoffset:
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
; GFX8: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
;
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:2048
; GFX9: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
;
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off offset:-2048
; GFX10: global_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], off
entry:
  ; %buffer_head = %buffer + ((id << 7) & 0xFFFF8000); %buffer_wave adds
  ; (id & 255) i64 elements on top of that.
  %call = tail call i64 @_Z13get_global_idj(i32 0)
  %conv = and i64 %call, 255
  %a0 = shl i64 %call, 7
  %idx.ext11 = and i64 %a0, 4294934528
  %add.ptr12 = getelementptr inbounds i8, i8 addrspace(1)* %buffer, i64 %idx.ext11
  %buffer_head = bitcast i8 addrspace(1)* %add.ptr12 to i64 addrspace(1)*

  %buffer_wave = getelementptr inbounds i64, i64 addrspace(1)* %buffer_head, i64 %conv

  %addr1 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870656
  %load1 = load i64, i64 addrspace(1)* %addr1, align 8

  %addr2 = getelementptr inbounds i64, i64 addrspace(1)* %buffer_wave, i64 -536870912
  %load2 = load i64, i64 addrspace(1)* %addr2, align 8


  %add = add i64 %load2, %load1

  store i64 %add, i64 addrspace(1)* %buffer_head, align 8
  ret void
}
|