mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
33f5f7933c
If f32 denormals were enabled pre-gfx9, we would still try to implement this with v_max_f32. Pre-gfx9, these instructions ignored the denormal mode and did not flush. Switch to the multiply form for f32 as a workaround which should always work in any case. This fixes conformance failures when the library implementation of fmin/fmax were accidentally not inlined, forcing the assumption of no flushing on targets where denormals are not enabled by default. This is a workaround, since really we should not be mixing code with different FP mode expectations, but prefer the lowering that will work in any mode. Now this will always use max to implement canonicalize on gfx9+. This is only really beneficial for f64. For f32/f16 it's a neutral choice (and worse in terms of code size in 1 case), but possibly worse for the compiler since it does add an extra register use operand. Leave this change for later.
118 lines
4.4 KiB
LLVM
118 lines
4.4 KiB
LLVM
; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX678,SI %s
|
|
; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX678,VI %s
|
|
; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
|
|
|
|
declare double @llvm.minnum.f64(double, double) #0
|
|
declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) #0
|
|
declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) #0
|
|
declare <8 x double> @llvm.minnum.v8f64(<8 x double>, <8 x double>) #0
|
|
declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0
|
|
|
|
; GCN-LABEL: {{^}}test_fmin_f64_ieee_noflush:
|
|
; GCN: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]]
|
|
; GCN: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]]
|
|
|
|
; GCN-DAG: v_max_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], [[A]], [[A]]
|
|
; GCN-DAG: v_max_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], [[B]], [[B]]
|
|
|
|
; GCN: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETA]], [[QUIETB]]
|
|
define amdgpu_kernel void @test_fmin_f64_ieee_noflush([8 x i32], double %a, [8 x i32], double %b) #1 {
|
|
%val = call double @llvm.minnum.f64(double %a, double %b) #0
|
|
store double %val, double addrspace(1)* undef, align 8
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}test_fmin_f64_ieee_flush:
|
|
; GCN: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]]
|
|
; GCN: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]]
|
|
; GFX678-DAG: v_mul_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], 1.0, [[A]]
|
|
; GFX678-DAG: v_mul_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], 1.0, [[B]]
|
|
|
|
; GFX9-DAG: v_max_f64 [[QUIETA:v\[[0-9]+:[0-9]+\]]], [[A]], [[A]]
|
|
; GFX9-DAG: v_max_f64 [[QUIETB:v\[[0-9]+:[0-9]+\]]], [[B]], [[B]]
|
|
|
|
; GCN: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[QUIETA]], [[QUIETB]]
|
|
define amdgpu_kernel void @test_fmin_f64_ieee_flush([8 x i32], double %a, [8 x i32], double %b) #2 {
|
|
%val = call double @llvm.minnum.f64(double %a, double %b) #0
|
|
store double %val, double addrspace(1)* undef, align 8
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}test_fmin_f64_no_ieee:
|
|
; GCN: ds_read_b64 [[VAL0:v\[[0-9]+:[0-9]+\]]]
|
|
; GCN: ds_read_b64 [[VAL1:v\[[0-9]+:[0-9]+\]]]
|
|
; GCN-NOT: [[VAL0]]
|
|
; GCN-NOT: [[VAL1]]
|
|
; GCN: v_min_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VAL0]], [[VAL1]]
|
|
; GCN-NOT: [[RESULT]]
|
|
; GCN: ds_write_b64 v{{[0-9]+}}, [[RESULT]]
|
|
define amdgpu_ps void @test_fmin_f64_no_ieee() nounwind {
|
|
%a = load volatile double, double addrspace(3)* undef
|
|
%b = load volatile double, double addrspace(3)* undef
|
|
%val = call double @llvm.minnum.f64(double %a, double %b) #0
|
|
store volatile double %val, double addrspace(3)* undef
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}test_fmin_v2f64:
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
|
|
%val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0
|
|
store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}test_fmin_v4f64:
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
|
|
%val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0
|
|
store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}test_fmin_v8f64:
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
define amdgpu_kernel void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
|
|
%val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0
|
|
store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
|
|
ret void
|
|
}
|
|
|
|
; GCN-LABEL: {{^}}test_fmin_v16f64:
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
; GCN: v_min_f64
|
|
define amdgpu_kernel void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
|
|
%val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0
|
|
store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
|
|
ret void
|
|
}
|
|
|
|
attributes #0 = { nounwind readnone }
|
|
attributes #1 = { nounwind "denormal-fp-math"="ieee,ieee" }
|
|
attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
|