; Commit 0a1e1e6e9f: SIFoldOperands::foldInstOperand iterated over an operand's uses while
; calling a function that changes def-use iterators along the way. As a result, the loop
; exited as soon as the def-use iterator changed, so the operand was folded into only the
; very first use instruction. That kept the VGPR live across the whole basic block and
; significantly increased register pressure; the performance drop observed in the SHOC
; DeviceMemory test was caused by this bug. Fix: collect the uses into a separate container
; and process them in another loop.
; Testing: make check-llvm, SHOC performance test.
; Reviewers: rampitec, ronlieb
; Differential Revision: https://reviews.llvm.org/D56161
; llvm-svn: 350350
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck --check-prefixes=GCN,GCN-DENORM %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-fp32-denormals < %s | FileCheck --check-prefixes=GCN,GCN-FLUSH %s

; GCN-LABEL: {{^}}div_1_by_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-DENORM: v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]

; GCN-FLUSH: v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]

; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
define amdgpu_kernel void @div_1_by_x_25ulp(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float 1.000000e+00, %load, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_minus_1_by_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-DENORM: v_mul_f32_e64 [[PRESCALED:v[0-9]+]], [[VAL]], -[[SCALE]]
; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]

; GCN-FLUSH: v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]

; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
define amdgpu_kernel void @div_minus_1_by_x_25ulp(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float -1.000000e+00, %load, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_1_by_minus_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-DENORM: v_mul_f32_e64 [[PRESCALED:v[0-9]+]], -[[VAL]], [[SCALE]]
; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]

; GCN-FLUSH: v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]

; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
define amdgpu_kernel void @div_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %div = fdiv float 1.000000e+00, %neg, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_minus_1_by_minus_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-DENORM: v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]

; GCN-FLUSH: v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]

; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %div = fdiv float -1.000000e+00, %neg, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32

; GCN-FLUSH: v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %load, !fpmath !0
  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
  ret void
}

; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp:
; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32

; GCN-FLUSH: v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
; GCN-FLUSH: v_rcp_f32_e64
; GCN-FLUSH: v_rcp_f32_e64
; GCN-FLUSH: v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
  %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %load, !fpmath !0
  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
  ret void
}

; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp:
; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32

; GCN-FLUSH: v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
; GCN-FLUSH: v_rcp_f32_e64
; GCN-FLUSH: v_rcp_f32_e64
; GCN-FLUSH: v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
  %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
  %div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %neg, !fpmath !0
  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
  ret void
}

; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32

; GCN-FLUSH: v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
  %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
  %div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %neg, !fpmath !0
  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
  ret void
}

; GCN-LABEL: {{^}}div_v4_c_by_x_25ulp:
; GCN-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32

; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc

; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]

; GCN-DENORM-DAG: v_div_fmas_f32
; GCN-DENORM-DAG: v_div_fmas_f32
; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}}
; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}

; GCN-FLUSH-DAG: v_rcp_f32_e32
; GCN-FLUSH-DAG: v_rcp_f32_e64

; GCN-NOT: v_cmp_gt_f32_e64
; GCN-NOT: v_cndmask_b32_e32
; GCN-FLUSH-NOT: v_div

; GCN: global_store_dwordx4
define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
  %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %load, !fpmath !0
  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
  ret void
}

; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp:
; GCN-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32

; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc

; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]

; GCN-DENORM-DAG: v_div_fmas_f32
; GCN-DENORM-DAG: v_div_fmas_f32
; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}

; GCN-FLUSH-DAG: v_rcp_f32_e32
; GCN-FLUSH-DAG: v_rcp_f32_e64

; GCN-NOT: v_cmp_gt_f32_e64
; GCN-NOT: v_cndmask_b32_e32
; GCN-FLUSH-NOT: v_div

; GCN: global_store_dwordx4
define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
  %load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
  %neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
  %div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %neg, !fpmath !0
  store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
  ret void
}

; GCN-LABEL: {{^}}div_v_by_x_25ulp:
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}

; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM: v_div_fmas_f32
; GCN-DENORM: v_div_fixup_f32 [[OUT:v[0-9]+]],

; GCN-FLUSH-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-FLUSH-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-FLUSH-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-FLUSH-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-FLUSH: v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-FLUSH: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]

; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float %num, %load, !fpmath !0
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_1_by_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv fast float 1.000000e+00, %load
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_minus_1_by_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv fast float -1.000000e+00, %load
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_1_by_minus_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %div = fdiv fast float 1.000000e+00, %neg
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_minus_1_by_minus_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %div = fdiv fast float -1.000000e+00, %neg
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM: v_div_fmas_f32
; GCN-DENORM: v_div_fixup_f32

; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float 1.000000e+00, %load
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM: v_div_fmas_f32
; GCN-DENORM: v_div_fixup_f32

; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %div = fdiv float -1.000000e+00, %load
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM: v_div_fmas_f32
; GCN-DENORM: v_div_fixup_f32

; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %div = fdiv float 1.000000e+00, %neg
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM: v_div_fmas_f32
; GCN-DENORM: v_div_fixup_f32

; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
  %load = load float, float addrspace(1)* %arg, align 4
  %neg = fsub float -0.000000e+00, %load
  %div = fdiv float -1.000000e+00, %neg
  store float %div, float addrspace(1)* %arg, align 4
  ret void
}

!0 = !{float 2.500000e+00}