1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 10:42:39 +01:00
llvm-mirror/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll
Alexander Timofeev 0a1e1e6e9f [AMDGPU] Fix scalar operand folding bug that causes SHOC performance regression.
Detailed description: SIFoldOperands::foldInstOperand iterates over the
operand uses while calling a function that changes the def-use iterators on the
way. As a result, the loop exits immediately when the def-use iterator is
changed. Hence, the operand is folded into the very first use instruction
only. This makes VGPR live along the whole basic block and increases
register pressure significantly. The performance drop observed in SHOC
DeviceMemory test is caused by this bug.

Proposed fix: collect uses to separate container for further processing
in another loop.

Testing: make check-llvm
SHOC performance test.

Reviewers: rampitec, ronlieb

Differential Revision: https://reviews.llvm.org/D56161

llvm-svn: 350350
2019-01-03 19:55:32 +00:00

460 lines
21 KiB
LLVM

; The test is compiled twice for gfx900: once with fp32 denormals enabled
; (+fp32-denormals) and once with them disabled (-fp32-denormals). Each
; invocation is verified with the shared check prefix plus one mode-specific
; prefix.
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+fp32-denormals < %s | FileCheck --check-prefixes=GCN,GCN-DENORM %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-fp32-denormals < %s | FileCheck --check-prefixes=GCN,GCN-FLUSH %s
; Scalar 1.0 / x where the fdiv carries !fpmath !0.
; Expected output per the checks below:
;  - denormals enabled: a scale factor is selected (0x2f800000, i.e. 2^-32, when
;    |x| > 0x6f800000, i.e. 2^96; otherwise 1.0), x is multiplied by it,
;    v_rcp_f32 is applied, and the reciprocal is multiplied by the same factor;
;  - denormals flushed: a single v_rcp_f32 of the loaded value.
; In both modes the result register is the one written by the final store.
; GCN-LABEL: {{^}}div_1_by_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-DENORM: v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
; GCN-FLUSH: v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
define amdgpu_kernel void @div_1_by_x_25ulp(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv float 1.000000e+00, %load, !fpmath !0
store float %div, float addrspace(1)* %arg, align 4
ret void
}
; Scalar -1.0 / x where the fdiv carries !fpmath !0.
; Expected output per the checks below:
;  - denormals enabled: the same scale / rcp / multiply sequence as for 1.0 / x,
;    except that the pre-scaling multiply is the e64 form with the scale factor
;    negated;
;  - denormals flushed: a single v_rcp_f32_e64 with a negated source operand.
; GCN-LABEL: {{^}}div_minus_1_by_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-DENORM: v_mul_f32_e64 [[PRESCALED:v[0-9]+]], [[VAL]], -[[SCALE]]
; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
; GCN-FLUSH: v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
define amdgpu_kernel void @div_minus_1_by_x_25ulp(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv float -1.000000e+00, %load, !fpmath !0
store float %div, float addrspace(1)* %arg, align 4
ret void
}
; Scalar 1.0 / (-x) where the fdiv carries !fpmath !0; the negation is written
; in IR as a subtraction from -0.0.
; Expected output per the checks below:
;  - denormals enabled: the scale / rcp / multiply sequence, with the negation
;    applied to the loaded value in the pre-scaling multiply (e64 form);
;  - denormals flushed: a single v_rcp_f32_e64 with a negated source operand.
; GCN-LABEL: {{^}}div_1_by_minus_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-DENORM: v_mul_f32_e64 [[PRESCALED:v[0-9]+]], -[[VAL]], [[SCALE]]
; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
; GCN-FLUSH: v_rcp_f32_e64 [[OUT:v[0-9]+]], -[[VAL]]
; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
define amdgpu_kernel void @div_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%neg = fsub float -0.000000e+00, %load
%div = fdiv float 1.000000e+00, %neg, !fpmath !0
store float %div, float addrspace(1)* %arg, align 4
ret void
}
; Scalar -1.0 / (-x) where the fdiv carries !fpmath !0.
; The checks below expect no negate modifier anywhere, i.e. the same
; instruction sequence as for plain 1.0 / x in both denormal modes.
; GCN-LABEL: {{^}}div_minus_1_by_minus_x_25ulp:
; GCN-DENORM-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-DENORM: v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
; GCN-DENORM: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-DENORM: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
; GCN-FLUSH: v_rcp_f32_e32 [[OUT:v[0-9]+]], [[VAL]]
; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
define amdgpu_kernel void @div_minus_1_by_minus_x_25ulp(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%neg = fsub float -0.000000e+00, %load
%div = fdiv float -1.000000e+00, %neg, !fpmath !0
store float %div, float addrspace(1)* %arg, align 4
ret void
}
; <4 x float> splat(1.0) / x where the fdiv carries !fpmath !0.
; Expected output per the checks below:
;  - denormals enabled: four compare/select pairs choosing the scale factor,
;    four pre-scaling multiplies, four v_rcp_f32 and four post-scaling
;    multiplies, accepted in any relative order;
;  - denormals flushed: four v_rcp_f32_e32 in order, the first reading the first
;    loaded SGPR and the last reading the last loaded SGPR, with the first and
;    last results forming the register range written by the dwordx4 store.
; GCN-LABEL: {{^}}div_v4_1_by_x_25ulp:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-FLUSH: v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
define amdgpu_kernel void @div_v4_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
%load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
%div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %load, !fpmath !0
store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
ret void
}
; <4 x float> splat(-1.0) / x where the fdiv carries !fpmath !0.
; The vector load is matched inside this function so that VAL0/VAL3 are bound
; here rather than inherited from an earlier function's checks (FileCheck
; variables persist across labels unless --enable-var-scope is used), and the
; flush run verifies that the reciprocal results feed the dwordx4 store, as
; the sibling v4 tests do.
; Expected output per the checks below:
;  - denormals enabled: four compare/select pairs choosing the scale factor,
;    four e64 pre-scaling multiplies with the scale factor negated, four
;    v_rcp_f32 and four post-scaling multiplies, accepted in any relative order;
;  - denormals flushed: four v_rcp_f32_e64 in order, the first and last reading
;    the negated first and last loaded SGPRs, followed by the vector store.
; GCN-LABEL: {{^}}div_v4_minus_1_by_x_25ulp:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-FLUSH: v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
; GCN-FLUSH: v_rcp_f32_e64
; GCN-FLUSH: v_rcp_f32_e64
; GCN-FLUSH: v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
define amdgpu_kernel void @div_v4_minus_1_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
%load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
%div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %load, !fpmath !0
store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
ret void
}
; <4 x float> splat(1.0) / (-x) where the fdiv carries !fpmath !0; the negation
; is written in IR as a subtraction from a -0.0 splat.
; The vector load is matched inside this function so that VAL0/VAL3 are bound
; here rather than inherited from an earlier function's checks (FileCheck
; variables persist across labels unless --enable-var-scope is used).
; Expected output per the checks below:
;  - denormals enabled: four compare/select pairs choosing the scale factor,
;    four e64 pre-scaling multiplies with the loaded SGPR negated, four
;    v_rcp_f32 and four post-scaling multiplies, accepted in any relative order;
;  - denormals flushed: four v_rcp_f32_e64 in order, the first and last reading
;    the negated first and last loaded SGPRs, followed by the vector store.
; GCN-LABEL: {{^}}div_v4_1_by_minus_x_25ulp:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-FLUSH: v_rcp_f32_e64 v[[OUT0:[0-9]+]], -s[[VAL0]]
; GCN-FLUSH: v_rcp_f32_e64
; GCN-FLUSH: v_rcp_f32_e64
; GCN-FLUSH: v_rcp_f32_e64 v[[OUT3:[0-9]+]], -s[[VAL3]]
; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
define amdgpu_kernel void @div_v4_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
%load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
%neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
%div = fdiv <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %neg, !fpmath !0
store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
ret void
}
; <4 x float> splat(-1.0) / (-x) where the fdiv carries !fpmath !0.
; The checks below expect no negate modifier anywhere, i.e. the same
; instruction shapes as for splat(1.0) / x in both denormal modes, with the
; flush run tying the first and last reciprocal to the loaded SGPR range and to
; the register range written by the dwordx4 store.
; GCN-LABEL: {{^}}div_v4_minus_1_by_minus_x_25ulp:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
; GCN-DENORM-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DENORM-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-DENORM-DAG: v_mul_f32_e32
; GCN-FLUSH: v_rcp_f32_e32 v[[OUT0:[0-9]+]], s[[VAL0]]
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32
; GCN-FLUSH: v_rcp_f32_e32 v[[OUT3:[0-9]+]], s[[VAL3]]
; GCN-FLUSH: global_store_dwordx4 v[{{[0-9:]+}}], v{{\[}}[[OUT0]]:[[OUT3]]], off
define amdgpu_kernel void @div_v4_minus_1_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
%load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
%neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
%div = fdiv <4 x float> <float -1.000000e+00, float -1.000000e+00, float -1.000000e+00, float -1.000000e+00>, %neg, !fpmath !0
store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
ret void
}
; <4 x float> <2.0, 1.0, -1.0, -2.0> / x where the fdiv carries !fpmath !0,
; i.e. a mix of +/-1.0 numerators and other constants in one vector.
; Expected output per the checks below:
;  - both runs: the two threshold/scale constants and two compare/select pairs,
;    with no further compare/select after them;
;  - denormals enabled: four v_div_scale_f32 (two ending in 2.0, two in -2.0),
;    two v_div_fmas_f32 and two v_div_fixup_f32 (ending in 2.0 and -2.0), plus
;    two scale / rcp / multiply sequences, one of whose pre-scaling multiplies
;    is the e64 form with a negated VGPR operand;
;  - denormals flushed: one v_rcp_f32_e32 and one v_rcp_f32_e64, and no
;    instruction whose name starts with v_div before the store.
; NOTE(review): presumably the v_div_* expansion belongs to the +/-2.0 lanes and
; the rcp-only form to the +/-1.0 lanes; the checks do not tie them to lanes.
; GCN-LABEL: {{^}}div_v4_c_by_x_25ulp:
; GCN-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, -v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]
; GCN-DENORM-DAG: v_div_fmas_f32
; GCN-DENORM-DAG: v_div_fmas_f32
; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}}
; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
; GCN-FLUSH-DAG: v_rcp_f32_e32
; GCN-FLUSH-DAG: v_rcp_f32_e64
; GCN-NOT: v_cmp_gt_f32_e64
; GCN-NOT: v_cndmask_b32_e32
; GCN-FLUSH-NOT: v_div
; GCN: global_store_dwordx4
define amdgpu_kernel void @div_v4_c_by_x_25ulp(<4 x float> addrspace(1)* %arg) {
%load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
%div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %load, !fpmath !0
store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
ret void
}
; <4 x float> <2.0, 1.0, -1.0, -2.0> / (-x) where the fdiv carries !fpmath !0;
; the negation is written in IR as a subtraction from a -0.0 splat.
; Expected output per the checks below:
;  - both runs: the two threshold/scale constants and two compare/select pairs,
;    with no further compare/select after them;
;  - denormals enabled: four v_div_scale_f32 and two v_div_fixup_f32 that all
;    end in -2.0, two v_div_fmas_f32, plus two scale / rcp / multiply sequences,
;    one of whose pre-scaling multiplies is the e64 form with a negated SGPR
;    operand;
;  - denormals flushed: one v_rcp_f32_e32 and one v_rcp_f32_e64, and no
;    instruction whose name starts with v_div before the store.
; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp:
; GCN-DAG: s_mov_b32 [[L:s[0-9]+]], 0x6f800000
; GCN-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DAG: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, [[L]]
; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[S]], vcc
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP1:v[0-9]+]], v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP1]]
; GCN-DENORM-DAG: v_rcp_f32_e32 [[RCP2:v[0-9]+]], v{{[0-9]+}}
; GCN-DENORM-DAG: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[RCP2]]
; GCN-DENORM-DAG: v_div_fmas_f32
; GCN-DENORM-DAG: v_div_fmas_f32
; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}}
; GCN-FLUSH-DAG: v_rcp_f32_e32
; GCN-FLUSH-DAG: v_rcp_f32_e64
; GCN-NOT: v_cmp_gt_f32_e64
; GCN-NOT: v_cndmask_b32_e32
; GCN-FLUSH-NOT: v_div
; GCN: global_store_dwordx4
define amdgpu_kernel void @div_v4_c_by_minus_x_25ulp(<4 x float> addrspace(1)* %arg) {
%load = load <4 x float>, <4 x float> addrspace(1)* %arg, align 16
%neg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %load
%div = fdiv <4 x float> <float 2.000000e+00, float 1.000000e+00, float -1.000000e+00, float -2.000000e+00>, %neg, !fpmath !0
store <4 x float> %div, <4 x float> addrspace(1)* %arg, align 16
ret void
}
; Scalar num / x with a non-constant numerator (kernel argument %num) and
; !fpmath !0 on the fdiv.
; Expected output per the checks below:
;  - denormals enabled: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
;    expansion, with the fixup result being the stored value;
;  - denormals flushed: the scale-select / pre-scale multiply / v_rcp_f32 /
;    post-scale multiply sequence.
; Note that this is the opposite assignment of sequences to modes compared with
; the constant-numerator *_25ulp tests in this file.
; GCN-LABEL: {{^}}div_v_by_x_25ulp:
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9:]+}}], 0x0{{$}}
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM: v_div_fmas_f32
; GCN-DENORM: v_div_fixup_f32 [[OUT:v[0-9]+]],
; GCN-FLUSH-DAG: v_mov_b32_e32 [[L:v[0-9]+]], 0x6f800000
; GCN-FLUSH-DAG: v_mov_b32_e32 [[S:v[0-9]+]], 0x2f800000
; GCN-FLUSH-DAG: v_cmp_gt_f32_e64 vcc, |[[VAL]]|, [[L]]
; GCN-FLUSH-DAG: v_cndmask_b32_e32 [[SCALE:v[0-9]+]], 1.0, [[S]], vcc
; GCN-FLUSH: v_mul_f32_e32 [[PRESCALED:v[0-9]+]], [[VAL]], [[SCALE]]
; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[PRESCALED]]
; GCN-FLUSH: v_mul_f32_e32 [[OUT:v[0-9]+]], [[SCALE]], [[RCP]]
; GCN: global_store_dword v[{{[0-9:]+}}], [[OUT]], off
define amdgpu_kernel void @div_v_by_x_25ulp(float addrspace(1)* %arg, float %num) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv float %num, %load, !fpmath !0
store float %div, float addrspace(1)* %arg, align 4
ret void
}
; Scalar 1.0 / x using `fdiv fast` (no !fpmath attachment).
; Both runs expect the same output: the value is loaded through s[0:1], a
; single v_rcp_f32_e32 is applied, and its result is stored.
; GCN-LABEL: {{^}}div_1_by_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_1_by_x_fast(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv fast float 1.000000e+00, %load
store float %div, float addrspace(1)* %arg, align 4
ret void
}
; Scalar -1.0 / x using `fdiv fast` (no !fpmath attachment).
; Both runs expect a single v_rcp_f32_e64 whose source operand carries the
; negate modifier, with its result stored.
; GCN-LABEL: {{^}}div_minus_1_by_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_minus_1_by_x_fast(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv fast float -1.000000e+00, %load
store float %div, float addrspace(1)* %arg, align 4
ret void
}
; Scalar 1.0 / (-x) using `fdiv fast`; the negation is written in IR as a
; subtraction from -0.0.
; Both runs expect the negation to appear as the source modifier of a single
; v_rcp_f32_e64, with its result stored.
; GCN-LABEL: {{^}}div_1_by_minus_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_1_by_minus_x_fast(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%neg = fsub float -0.000000e+00, %load
%div = fdiv fast float 1.000000e+00, %neg
store float %div, float addrspace(1)* %arg, align 4
ret void
}
; Scalar -1.0 / (-x) using `fdiv fast`.
; Both runs expect a plain v_rcp_f32_e32 of the loaded value with no negate
; modifier, with its result stored.
; GCN-LABEL: {{^}}div_minus_1_by_minus_x_fast:
; GCN: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_minus_1_by_minus_x_fast(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%neg = fsub float -0.000000e+00, %load
%div = fdiv fast float -1.000000e+00, %neg
store float %div, float addrspace(1)* %arg, align 4
ret void
}
; Scalar 1.0 / x with neither fast-math flags nor !fpmath on the fdiv.
; Expected output per the checks below:
;  - denormals enabled: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
;    expansion;
;  - denormals flushed: a single v_rcp_f32_e32 of the loaded value, stored.
; GCN-LABEL: {{^}}div_1_by_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM: v_div_fmas_f32
; GCN-DENORM: v_div_fixup_f32
; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv float 1.000000e+00, %load
store float %div, float addrspace(1)* %arg, align 4
ret void
}
; Scalar -1.0 / x with neither fast-math flags nor !fpmath on the fdiv.
; Expected output per the checks below:
;  - denormals enabled: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
;    expansion;
;  - denormals flushed: a single v_rcp_f32_e64 with a negated source, stored.
; GCN-LABEL: {{^}}div_minus_1_by_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM: v_div_fmas_f32
; GCN-DENORM: v_div_fixup_f32
; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_minus_1_by_x_correctly_rounded(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%div = fdiv float -1.000000e+00, %load
store float %div, float addrspace(1)* %arg, align 4
ret void
}
; Scalar 1.0 / (-x) with neither fast-math flags nor !fpmath on the fdiv; the
; negation is written in IR as a subtraction from -0.0.
; Expected output per the checks below:
;  - denormals enabled: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
;    expansion;
;  - denormals flushed: a single v_rcp_f32_e64 with a negated source, stored.
; GCN-LABEL: {{^}}div_1_by_minus_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM: v_div_fmas_f32
; GCN-DENORM: v_div_fixup_f32
; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e64 [[RCP:v[0-9]+]], -[[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%neg = fsub float -0.000000e+00, %load
%div = fdiv float 1.000000e+00, %neg
store float %div, float addrspace(1)* %arg, align 4
ret void
}
; Scalar -1.0 / (-x) with neither fast-math flags nor !fpmath on the fdiv.
; Expected output per the checks below:
;  - denormals enabled: the v_div_scale / v_rcp / v_div_fmas / v_div_fixup
;    expansion;
;  - denormals flushed: a plain v_rcp_f32_e32 of the loaded value with no
;    negate modifier, stored.
; GCN-LABEL: {{^}}div_minus_1_by_minus_x_correctly_rounded:
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM-DAG: v_rcp_f32_e32
; GCN-DENORM-DAG: v_div_scale_f32
; GCN-DENORM: v_div_fmas_f32
; GCN-DENORM: v_div_fixup_f32
; GCN-FLUSH: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x0
; GCN-FLUSH: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[VAL]]
; GCN-FLUSH: global_store_dword v[{{[0-9:]+}}], [[RCP]], off
define amdgpu_kernel void @div_minus_1_by_minus_x_correctly_rounded(float addrspace(1)* %arg) {
%load = load float, float addrspace(1)* %arg, align 4
%neg = fsub float -0.000000e+00, %load
%div = fdiv float -1.000000e+00, %neg
store float %div, float addrspace(1)* %arg, align 4
ret void
}
; Operand of the !fpmath attachments used by the *_25ulp functions above: 2.5.
; Per the LLVM Language Reference, fpmath metadata gives the maximum acceptable
; relative error of the result in ULPs.
!0 = !{float 2.500000e+00}