mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-22 18:54:02 +01:00
cca61dc4bb
Get rid of all FIXMEs and base the heuristic on `num-clustered-dwords`. The main intuition behind this is as follows. The existing heuristic can be roughly summarized as below: * Assume that all the mem op instructions participating in the clustering process load/store the same number of bytes * If the number of bytes loaded by each mem op is 4, then cluster at most 5 mem ops, that is at most 20 bytes * If the number of bytes loaded by each mem op is 8, then cluster at most 3 mem ops, that is at most 24 bytes * If the number of bytes loaded by each mem op is 16, then cluster at most 2 mem ops, that is at most 32 bytes So, we need to make sure that the new heuristic does not completely deviate from the above one, and that it properly handles both sub-word loads and wide loads. Reviewed By: arsenm, rampitec Differential Revision: https://reviews.llvm.org/D84354
277 lines
12 KiB
LLVM
277 lines
12 KiB
LLVM
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-NOSDWA -check-prefix=FUNC %s
|
|
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=SI-SDWA -check-prefix=FUNC %s
|
|
; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG -check-prefix=FUNC %s
|
|
|
|
; Intrinsic declarations for this test. For llvm.cttz the second (i1) operand
; is the is_zero_undef flag.
; NOTE(review): no kernel visible in this file calls @llvm.cttz.i7 -- confirm
; it is still needed before removing it.

; Scalar cttz overloads.
declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone

; Vector cttz overloads.
declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone

; Work-item id in the x dimension; used to index per-work-item loads.
declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
|
|
|
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32:
|
|
; SI: s_load_dword [[VAL:s[0-9]+]],
|
|
; SI: s_ff1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
|
|
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
|
|
; SI: buffer_store_dword [[VRESULT]],
|
|
; SI: s_endpgm
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
; Takes cttz of the i32 kernel argument %val with is_zero_undef = true and
; stores the count to %out.
define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
  %tz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
  store i32 %tz, i32 addrspace(1)* %out, align 4
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32:
|
|
; SI: {{buffer|flat}}_load_dword [[VAL:v[0-9]+]],
|
|
; SI: v_ffbl_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
|
|
; SI: buffer_store_dword [[RESULT]],
|
|
; SI: s_endpgm
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
; Loads the i32 at %valptr[workitem.id.x], takes cttz with is_zero_undef = true
; and stores the count to %out.
define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
  ; Index the input by the work-item id so each work-item reads its own element.
  %wi.x = call i32 @llvm.amdgcn.workitem.id.x()
  %elt.ptr = getelementptr i32, i32 addrspace(1)* %valptr, i32 %wi.x
  %elt = load i32, i32 addrspace(1)* %elt.ptr, align 4
  %tz = call i32 @llvm.cttz.i32(i32 %elt, i1 true) nounwind readnone
  store i32 %tz, i32 addrspace(1)* %out, align 4
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_v2i32:
|
|
; SI: {{buffer|flat}}_load_dwordx2
|
|
; SI: v_ffbl_b32_e32
|
|
; SI: v_ffbl_b32_e32
|
|
; SI: buffer_store_dwordx2
|
|
; SI: s_endpgm
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
; Loads the <2 x i32> at %valptr[workitem.id.x], takes element-wise cttz with
; is_zero_undef = true and stores the vector of counts to %out.
define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
  ; Index the input by the work-item id so each work-item reads its own vector.
  %wi.x = call i32 @llvm.amdgcn.workitem.id.x()
  %vec.ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %valptr, i32 %wi.x
  %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vec.ptr, align 8
  %tz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %vec, i1 true) nounwind readnone
  store <2 x i32> %tz, <2 x i32> addrspace(1)* %out, align 8
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_v4i32:
|
|
; SI: {{buffer|flat}}_load_dwordx4
|
|
; SI: v_ffbl_b32_e32
|
|
; SI: v_ffbl_b32_e32
|
|
; SI: v_ffbl_b32_e32
|
|
; SI: v_ffbl_b32_e32
|
|
; SI: buffer_store_dwordx4
|
|
; SI: s_endpgm
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
; Loads the <4 x i32> at %valptr[workitem.id.x], takes element-wise cttz with
; is_zero_undef = true and stores the vector of counts to %out.
define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
  ; Index the input by the work-item id so each work-item reads its own vector.
  %wi.x = call i32 @llvm.amdgcn.workitem.id.x()
  %vec.ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %valptr, i32 %wi.x
  %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vec.ptr, align 16
  %tz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %vec, i1 true) nounwind readnone
  store <4 x i32> %tz, <4 x i32> addrspace(1)* %out, align 16
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i8_with_select:
|
|
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
|
|
; EG: MEM_RAT MSKOR
|
|
; EG: FFBL_INT
|
|
; Takes cttz (is_zero_undef = true) of the i8 kernel argument %val and stores
; the count to %out.
; NOTE(review): %picked is computed but never used -- the store writes %tz, so
; the select is dead. The v_*_with_select kernels in this file store the select
; result instead. Confirm whether this is intentional before changing it; the
; expected-output lines were written against the IR as it stands.
define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 %val) nounwind {
  %tz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
  %nonzero = icmp ne i8 %val, 0
  %picked = select i1 %nonzero, i8 %tz, i8 32
  store i8 %tz, i8 addrspace(1)* %out, align 4
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i16_with_select:
|
|
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
|
|
; EG: MEM_RAT MSKOR
|
|
; EG: FFBL_INT
|
|
; Takes cttz (is_zero_undef = true) of the i16 kernel argument %val and stores
; the count to %out.
; NOTE(review): %picked is computed but never used -- the store writes %tz, so
; the select is dead. The v_*_with_select kernels in this file store the select
; result instead. Confirm whether this is intentional before changing it; the
; expected-output lines were written against the IR as it stands.
define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 %val) nounwind {
  %tz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
  %nonzero = icmp ne i16 %val, 0
  %picked = select i1 %nonzero, i16 %tz, i16 32
  store i16 %tz, i16 addrspace(1)* %out, align 4
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i32_with_select:
|
|
; SI: s_ff1_i32_b32
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
|
|
; EG: FFBL_INT {{\*? *}}[[RESULT]]
|
|
; Takes cttz (is_zero_undef = true) of the i32 kernel argument %val and stores
; the count to %out.
; NOTE(review): %picked is computed but never used -- the store writes %tz, so
; the select is dead. The v_*_with_select kernels in this file store the select
; result instead. Confirm whether this is intentional before changing it; the
; expected-output lines were written against the IR as it stands.
define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
  %tz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
  %nonzero = icmp ne i32 %val, 0
  %picked = select i1 %nonzero, i32 %tz, i32 32
  store i32 %tz, i32 addrspace(1)* %out, align 4
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}s_cttz_zero_undef_i64_with_select:
|
|
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
|
|
; SI: s_ff1_i32_b32 s{{[0-9]+}}, s{{[0-9]+}}
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
|
|
; Takes cttz (is_zero_undef = true) of the i64 kernel argument %val and stores
; the count to %out.
; NOTE(review): %picked is computed but never used -- the store writes %tz, so
; the select is dead. The v_*_with_select kernels in this file store the select
; result instead. Confirm whether this is intentional before changing it; the
; expected-output lines were written against the IR as it stands.
define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
  %tz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
  %nonzero = icmp ne i64 %val, 0
  %picked = select i1 %nonzero, i64 %tz, i64 32
  store i64 %tz, i64 addrspace(1)* %out, align 4
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i8_with_select:
|
|
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
|
; SI-SDWA: v_ffbl_b32_e32
|
|
; EG: MEM_RAT MSKOR
|
|
; Loads an i8 from %arrayidx and stores
;   (x != 0) ? cttz(x, is_zero_undef = true) : 32
; to %out.
define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %x = load i8, i8 addrspace(1)* %arrayidx, align 1
  %tz = tail call i8 @llvm.cttz.i8(i8 %x, i1 true) nounwind readnone
  ; Guard the zero-undef result: a zero input selects the constant 32 instead.
  %nonzero = icmp ne i8 %x, 0
  %picked = select i1 %nonzero, i8 %tz, i8 32
  store i8 %picked, i8 addrspace(1)* %out, align 4
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i16_with_select:
|
|
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
|
; SI-SDWA: v_ffbl_b32_e32
|
|
; EG: MEM_RAT MSKOR
|
|
; Loads an i16 (byte-aligned) from %arrayidx and stores
;   (x != 0) ? cttz(x, is_zero_undef = true) : 32
; to %out.
define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %x = load i16, i16 addrspace(1)* %arrayidx, align 1
  %tz = tail call i16 @llvm.cttz.i16(i16 %x, i1 true) nounwind readnone
  ; Guard the zero-undef result: a zero input selects the constant 32 instead.
  %nonzero = icmp ne i16 %x, 0
  %picked = select i1 %nonzero, i16 %tz, i16 32
  store i16 %picked, i16 addrspace(1)* %out, align 4
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i32_with_select:
|
|
; SI-DAG: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
|
; SI-DAG: v_cmp_ne_u32_e32 vcc, 0
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
|
|
; Loads an i32 (byte-aligned) from %arrayidx and stores
;   (x != 0) ? cttz(x, is_zero_undef = true) : 32
; to %out.
define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %x = load i32, i32 addrspace(1)* %arrayidx, align 1
  %tz = tail call i32 @llvm.cttz.i32(i32 %x, i1 true) nounwind readnone
  ; Guard the zero-undef result: a zero input selects the constant 32 instead.
  %nonzero = icmp ne i32 %x, 0
  %picked = select i1 %nonzero, i32 %tz, i32 32
  store i32 %picked, i32 addrspace(1)* %out, align 4
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_zero_undef_i64_with_select:
|
|
; SI-NOSDWA: v_or_b32_e32
|
|
; SI-NOSDWA: v_or_b32_e32
|
|
; SI-NOSDWA: v_or_b32_e32
|
|
; SI-NOSDWA: v_or_b32_e32
|
|
; SI-NOSDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
|
; SI-NOSDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
|
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
|
|
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
|
|
; SI-SDWA: v_or_b32_e32
|
|
; SI-SDWA: v_or_b32_sdwa
|
|
; SI-SDWA: v_or_b32_e32
|
|
; SI-SDWA: v_or_b32_sdwa
|
|
; SI-SDWA: v_or_b32_e32 [[VAL1:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
|
; SI-SDWA: v_or_b32_e32 [[VAL2:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
|
; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL1]]
|
|
; SI-SDWA: v_ffbl_b32_e32 v{{[0-9]+}}, [[VAL2]]
|
|
; SI: v_cmp_eq_u32_e32 vcc, 0
|
|
; SI: v_cmp_ne_u64_e32 vcc, 0
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
|
|
; Loads an i64 (byte-aligned) from %arrayidx and stores
;   (x != 0) ? cttz(x, is_zero_undef = true) : 32
; to %out.
; NOTE(review): the fallback constant is 32 even though the operand is 64 bits
; wide -- presumably deliberate so the select cannot fold into a plain cttz;
; confirm.
define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(i64 addrspace(1)* noalias %out, i64 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %x = load i64, i64 addrspace(1)* %arrayidx, align 1
  %tz = tail call i64 @llvm.cttz.i64(i64 %x, i1 true) nounwind readnone
  %nonzero = icmp ne i64 %x, 0
  %picked = select i1 %nonzero, i64 %tz, i64 32
  store i64 %picked, i64 addrspace(1)* %out, align 4
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_i32_sel_eq_neg1:
|
|
; SI: v_ffbl_b32_e32 [[VAL:v[0-9]+]], v{{[0-9]+}}
|
|
; SI: buffer_store_dword [[VAL]],
|
|
; SI: s_endpgm
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW
|
|
; EG: FFBL_INT
|
|
; Loads an i32 (byte-aligned) from %arrayidx and stores
;   (x == 0) ? -1 : cttz(x, is_zero_undef = false)
; to %out.
define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %x = load i32, i32 addrspace(1)* %arrayidx, align 1
  %cttz = call i32 @llvm.cttz.i32(i32 %x, i1 false) nounwind readnone
  %is_zero = icmp eq i32 %x, 0
  %res = select i1 %is_zero, i32 -1, i32 %cttz
  store i32 %res, i32 addrspace(1)* %out
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_i32_sel_ne_neg1:
|
|
; SI: v_ffbl_b32_e32 [[VAL:v[0-9]+]], v{{[0-9]+}}
|
|
; SI: buffer_store_dword [[VAL]],
|
|
; SI: s_endpgm
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW
|
|
; EG: FFBL_INT
|
|
; Loads an i32 (byte-aligned) from %arrayidx and stores
;   (x != 0) ? cttz(x, is_zero_undef = false) : -1
; to %out. Same value as the sel_eq_neg1 kernel, with the compare and the
; select arms inverted.
define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %x = load i32, i32 addrspace(1)* %arrayidx, align 1
  %cttz = call i32 @llvm.cttz.i32(i32 %x, i1 false) nounwind readnone
  %nonzero = icmp ne i32 %x, 0
  %res = select i1 %nonzero, i32 %cttz, i32 -1
  store i32 %res, i32 addrspace(1)* %out
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_i32_sel_ne_bitwidth:
|
|
; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
|
; SI: v_cmp
|
|
; SI: v_cndmask
|
|
; SI: s_endpgm
|
|
; EG: MEM_RAT_CACHELESS STORE_RAW
|
|
; EG: FFBL_INT
|
|
; Loads an i32 (byte-aligned) from %arrayidx, takes cttz with
; is_zero_undef = false, and stores
;   (cttz != 32) ? cttz : -1
; to %out. Unlike the sel_*_neg1 kernels, the compare is on the cttz result
; (against the bit width), not on the loaded value.
define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %x = load i32, i32 addrspace(1)* %arrayidx, align 1
  %cttz = call i32 @llvm.cttz.i32(i32 %x, i1 false) nounwind readnone
  %not_bitwidth = icmp ne i32 %cttz, 32
  %res = select i1 %not_bitwidth, i32 %cttz, i32 -1
  store i32 %res, i32 addrspace(1)* %out
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_i8_sel_eq_neg1:
|
|
; SI: {{buffer|flat}}_load_ubyte
|
|
; SI-NOSDWA: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
|
; SI-SDWA: v_ffbl_b32_e32
|
|
; EG: MEM_RAT MSKOR
|
|
; EG: FFBL_INT
|
|
; Loads an i8 from %arrayidx and stores
;   (x == 0) ? -1 : cttz(x, is_zero_undef = false)
; to %out.
define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %x = load i8, i8 addrspace(1)* %arrayidx, align 1
  %cttz = call i8 @llvm.cttz.i8(i8 %x, i1 false) nounwind readnone
  %is_zero = icmp eq i8 %x, 0
  %res = select i1 %is_zero, i8 -1, i8 %cttz
  store i8 %res, i8 addrspace(1)* %out
  ret void
}
|
|
|
|
; FUNC-LABEL: {{^}}v_cttz_i16_sel_eq_neg1:
|
|
; SI: {{buffer|flat}}_load_ubyte
|
|
; SI: v_ffbl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
|
|
; SI: buffer_store_short
|
|
; EG: MEM_RAT MSKOR
|
|
; EG: FFBL_INT
|
|
; Loads an i16 (byte-aligned) from %arrayidx and stores
;   (x == 0) ? -1 : cttz(x, is_zero_undef = false)
; to %out.
define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* nocapture readonly %arrayidx) nounwind {
  %x = load i16, i16 addrspace(1)* %arrayidx, align 1
  %cttz = call i16 @llvm.cttz.i16(i16 %x, i1 false) nounwind readnone
  %is_zero = icmp eq i16 %x, 0
  %res = select i1 %is_zero, i16 -1, i16 %cttz
  store i16 %res, i16 addrspace(1)* %out
  ret void
}
|
|
|
|
|