1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-26 12:43:36 +01:00
llvm-mirror/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
Matt Arsenault bc7384fc93 DAG: Change behavior of fminnum/fmaxnum nodes
Introduce new versions that follow the IEEE semantics
to help with legalization that may need quieted inputs.

There are some regressions from inserting unnecessary
canonicalizes when these are matched from fast math
fcmp + select which should be fixed in a future commit.

llvm-svn: 344914
2018-10-22 16:27:27 +00:00

913 lines
40 KiB
LLVM

; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s
; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,VI-FLUSH,GCN-FLUSH %s
; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM,GCN-NOEXCEPT %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM,GCN-NOEXCEPT %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s
; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX9-DENORM: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%v = load float, float addrspace(1)* %gep, align 4
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32:
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = fmul float %load, 15.0
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_legacy_value_f32:
; GCN: v_mul_legacy_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = call float @llvm.amdgcn.fmul.legacy(float %load, float 15.0)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = fsub float 15.0, %load
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = fadd float %load, 15.0
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = call float @llvm.sqrt.f32(float %load)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_fceil_value_f32:
; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = call float @llvm.ceil.f32(float %load)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_floor_value_f32:
; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = call float @llvm.floor.f32(float %load)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
; GCN: s_mov_b32 [[SGPR:s[0-9]+]], 0x41700000
; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SGPR]], [[SGPR]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = call float @llvm.amdgcn.fmad.ftz.f32(float %load, float 15.0, float 15.0)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
; GCN-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32:
; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]],
; GCN-FLUSH: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
; GCN-DENORM: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = call float @llvm.canonicalize.f32(float %load)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = fpext float %load to double
%canonicalized = tail call double @llvm.canonicalize.f64(double %v)
%gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
store double %canonicalized, double addrspace(1)* %gep2, align 8
ret void
}
; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
%load = load half, half addrspace(1)* %gep, align 2
%v = fpext half %load to float
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
%gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
store float %canonicalized, float addrspace(1)* %gep2, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16_flushf16:
; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16_flushf16(half addrspace(1)* %arg, float addrspace(1)* %out) #2 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
%load = load half, half addrspace(1)* %gep, align 2
%v = fpext half %load to float
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
%gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
store float %canonicalized, float addrspace(1)* %gep2, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
%load = load double, double addrspace(1)* %gep, align 8
%v = fptrunc double %load to float
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
%gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
store float %canonicalized, float addrspace(1)* %gep2, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_max
; GCN-NOT: v_mul
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = fptrunc float %load to half
%canonicalized = tail call half @llvm.canonicalize.f16(half %v)
%gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
store half %canonicalized, half addrspace(1)* %gep2, align 2
ret void
}
; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32_flushf16:
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_max
; GCN-NOT: v_mul
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16(float addrspace(1)* %arg, half addrspace(1)* %out) #2 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = fptrunc float %load to half
%canonicalized = tail call half @llvm.canonicalize.f16(half %v)
%gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
store half %canonicalized, half addrspace(1)* %gep2, align 2
ret void
}
; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
%load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
%v = fptrunc <2 x float> %load to <2 x half>
%canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v)
%gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id
store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 4
ret void
}
; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, -1.0, v{{[0-9]+}}
; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = fsub float -0.0, %load
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_fneg_value_f32:
; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v0 = fadd float %load, 0.0
%v = fsub float -0.0, %v0
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = tail call float @llvm.fabs.f32(float %load)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_no_fold_canonicalize_fcopysign_value_f32:
; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
; GCN-NOT: v_mul_
; GCN-NOT: v_max_
define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(float addrspace(1)* %arg, float %sign) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%canon.load = tail call float @llvm.canonicalize.f32(float %load)
%copysign = call float @llvm.copysign.f32(float %canon.load, float %sign)
%v = tail call float @llvm.fabs.f32(float %load)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_fabs_value_f32:
; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v0 = fadd float %load, 0.0
%v = tail call float @llvm.fabs.f32(float %v0)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_sin_value_f32:
; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = tail call float @llvm.sin.f32(float %load)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_cos_value_f32:
; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = tail call float @llvm.cos.f32(float %load)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_sin_value_f16:
; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
%load = load half, half addrspace(1)* %gep, align 2
%v = tail call half @llvm.sin.f16(half %load)
%canonicalized = tail call half @llvm.canonicalize.f16(half %v)
store half %canonicalized, half addrspace(1)* %gep, align 2
ret void
}
; GCN-LABEL: test_fold_canonicalize_cos_value_f16:
; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
%load = load half, half addrspace(1)* %gep, align 2
%v = tail call half @llvm.cos.f16(half %load)
%canonicalized = tail call half @llvm.canonicalize.f16(half %v)
store half %canonicalized, half addrspace(1)* %gep, align 2
ret void
}
; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32:
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
; GCN-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
; GCN-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]]
; GCN-NOT: v_max
; GCN-NOT: v_mul
; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = tail call float @llvm.minnum.f32(float %load, float 0.0)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GCN-DENORM-NOT: v_max
; GCN-DENORM-NOT: v_mul
; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN-DENORM-NOT: v_max
; GCN-DENORM-NOT: v_mul
; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(float addrspace(1)* %arg) #1 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = tail call float @llvm.minnum.f32(float %load, float 0.0)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v0 = fadd float %load, 0.0
%v = tail call float @llvm.minnum.f32(float %v0, float 0.0)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; FIXME: Should there be more checks here? minnum with NaN operand is simplified away.
; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]]
; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]]
; GCN-DENORM: v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]]
define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float))
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]]
; GFX9-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]]
; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]]
; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
; GFX9: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
; VI-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
; VI-FLUSH: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v = tail call float @llvm.maxnum.f32(float %load, float 0.0)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32:
; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
; GCN-NOT: v_max
; GCN-NOT: v_mul
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load = load float, float addrspace(1)* %gep, align 4
%v0 = fadd float %load, 0.0
%v = tail call float @llvm.maxnum.f32(float %v0, float 0.0)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64:
; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
%load = load double, double addrspace(1)* %gep, align 8
%v0 = fadd double %load, 0.0
%v = tail call double @llvm.maxnum.f64(double %v0, double 0.0)
%canonicalized = tail call double @llvm.canonicalize.f64(double %v)
store double %canonicalized, double addrspace(1)* %gep, align 8
ret void
}
; GCN-LABEL: test_fold_canonicalize_fmul_value_f32_no_ieee:
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
entry:
%v = fmul float %arg, 15.0
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
ret float %canonicalized
}
; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) {
entry:
%v = fmul nnan float %arg, 15.0
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
ret float %canonicalized
}
; GCN-LABEL: {{^}}test_fold_canonicalize_fdiv_value_f32_no_ieee:
; GCN: v_div_fixup_f32
; GCN-NOT: v_max
; GCN-NOT: v_mul
; GCN: ; return
define amdgpu_ps float @test_fold_canonicalize_fdiv_value_f32_no_ieee(float %arg0) {
entry:
%v = fdiv float 15.0, %arg0
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
ret float %canonicalized
}
; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32
; GFX9-DENORM: global_load_dword [[V:v[0-9]+]],
; GFX9-DENORM: global_store_dword v[{{[0-9:]+}}], [[V]]
; GFX9-DENORM-NOT: 1.0
; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%v = load float, float addrspace(1)* %gep, align 4
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
%gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
store float %canonicalized, float addrspace(1)* %gep2, align 4
ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64
; GCN: {{flat|global}}_load_dwordx2 [[V:v\[[0-9:]+\]]],
; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
; GCN-NOT: v_mul_
; GCN-NOT: v_max_
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
%v = load double, double addrspace(1)* %gep, align 8
%canonicalized = tail call double @llvm.canonicalize.f64(double %v)
%gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
store double %canonicalized, double addrspace(1)* %gep2, align 8
ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]],
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
%v = load half, half addrspace(1)* %gep, align 2
%canonicalized = tail call half @llvm.canonicalize.f16(half %v)
%gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
store half %canonicalized, half addrspace(1)* %gep2, align 2
ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_select_value_f32:
; GCN: v_add_f32
; GCN: v_add_f32
; GCN: v_cndmask_b32
; GCN-NOT: v_mul_
; GCN-NOT: v_max_
define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(float addrspace(1)* %arg) {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
%load0 = load volatile float, float addrspace(1)* %gep, align 4
%load1 = load volatile float, float addrspace(1)* %gep, align 4
%load2 = load volatile i32, i32 addrspace(1)* undef, align 4
%v0 = fadd float %load0, 15.0
%v1 = fadd float %load1, 32.0
%cond = icmp eq i32 %load2, 0
%select = select i1 %cond, float %v0, float %v1
%canonicalized = tail call float @llvm.canonicalize.f32(float %select)
store float %canonicalized, float addrspace(1)* %gep, align 4
ret void
}
; Need to quiet the nan with a separate instruction since it will be
; passed through the minnum.
; FIXME: canonicalize doens't work correctly without ieee_mode
; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode:
; GFX9-NOT: v0
; GFX9-NOT: v1
; GFX9: v_min_f32_e32 v0, v0, v1
; GFX9-NEXT: ; return to shader
; VI-FLUSH: v_min_f32_e32 v0, v0, v1
; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
; VI-FLUSH-NEXT: ; return
; VI-DENORM-NOT: v0
; VI-DENORM: v_min_f32_e32 v0, v0, v1
; VI-DENORM-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) {
%v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
ret float %canonicalized
}
; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_ieee_mode:
; GFX9: v_min_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64
; VI-FLUSH-DAG: v_mul_f32_e32 v0, 1.0, v0
; VI-FLUSH-DAG: v_mul_f32_e32 v1, 1.0, v1
; VI-FLUSH: v_min_f32_e32 v0, v0, v1
; VI-DENORM-DAG: v_max_f32_e32 v0, v0, v0
; VI-DENORM-DAG: v_max_f32_e32 v1, v1, v1
; VI-DENORM: v_min_f32_e32 v0, v0, v1
; VI-NEXT: s_setpc_b64
define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) {
%v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
ret float %canonicalized
}
; Canonicalizing flush necessary pre-gfx9
; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode_nnan:
; GCN: v_min_f32_e32 v0, v0, v1
; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode_nnan(float %arg0, float %arg1) #1 {
%v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
%canonicalized = tail call float @llvm.canonicalize.f32(float %v)
ret float %canonicalized
}
; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16:
; GFX9-DAG: v_add_f16_e32
; GFX9-DAG: v_mul_f16_e32
; GFX9-NOT: v_max
; GFX9-NOT: v_pk_max
define <2 x half> @v_test_canonicalize_build_vector_v2f16(<2 x half> %vec) {
%lo = extractelement <2 x half> %vec, i32 0
%hi = extractelement <2 x half> %vec, i32 1
%lo.op = fadd half %lo, 1.0
%hi.op = fmul half %lo, 4.0
%ins0 = insertelement <2 x half> undef, half %lo.op, i32 0
%ins1 = insertelement <2 x half> %ins0, half %hi.op, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
ret <2 x half> %canonicalized
}
; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_noncanon1_v2f16:
; GFX9: v_add_f16_e32
; GFX9: v_pk_max
define <2 x half> @v_test_canonicalize_build_vector_noncanon1_v2f16(<2 x half> %vec) {
%lo = extractelement <2 x half> %vec, i32 0
%lo.op = fadd half %lo, 1.0
%ins = insertelement <2 x half> %vec, half %lo.op, i32 0
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
ret <2 x half> %canonicalized
}
; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_noncanon0_v2f16:
; GFX9: v_add_f16_sdwa
; GFX9: v_pk_max
define <2 x half> @v_test_canonicalize_build_vector_noncanon0_v2f16(<2 x half> %vec) {
%hi = extractelement <2 x half> %vec, i32 1
%hi.op = fadd half %hi, 1.0
%ins = insertelement <2 x half> %vec, half %hi.op, i32 1
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
ret <2 x half> %canonicalized
}
; GCN-LABEL: {{^}}v_test_canonicalize_extract_element_v2f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_mul_f16_e32 v0, 4.0, v0
; GFX9-NEXT: s_setpc_b64
define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) {
%vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
%elt = extractelement <2 x half> %vec.op, i32 0
%canonicalized = call half @llvm.canonicalize.f16(half %elt)
ret half %canonicalized
}
; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16:
; GFX9: v_pk_mul_f16
; GFX9: v_mul_f16_e32
; GFX9-NOT: v_max
; GFX9-NOT: v_pk_max
define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
%vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
%ins.op = fmul half %val, 8.0
%ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
ret <2 x half> %canonicalized
}
; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_vec_v2f16:
; GFX9: v_mul_f16
; GFX9: v_pk_max_f16 v0, v0, v0
; GFX9-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_insertelement_noncanon_vec_v2f16(<2 x half> %vec, half %val, i32 %idx) {
%ins.op = fmul half %val, 8.0
%ins = insertelement <2 x half> %vec, half %ins.op, i32 %idx
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
ret <2 x half> %canonicalized
}
; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_insval_v2f16:
; GFX9: v_pk_mul_f16
; GFX9: v_pk_max_f16 v0, v0, v0
; GFX9-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x half> %vec, half %val, i32 %idx) {
%vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
%ins = insertelement <2 x half> %vec.op, half %val, i32 %idx
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
ret <2 x half> %canonicalized
}
; GCN-LABEL: {{^}}v_test_canonicalize_cvt_pkrtz:
; GCN: s_waitcnt
; GCN-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
; GCN-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) {
%cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
%canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt)
ret <2 x half> %canonicalized
}
; GCN-LABEL: {{^}}v_test_canonicalize_cubeid:
; GCN: s_waitcnt
; GCN-NEXT: v_cubeid_f32 v0, v0, v1, v2
; GCN-NEXT: s_setpc_b64
define float @v_test_canonicalize_cubeid(float %a, float %b, float %c) {
%cvt = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
%canonicalized = call float @llvm.canonicalize.f32(float %cvt)
ret float %canonicalized
}
; GCN-LABEL: {{^}}v_test_canonicalize_frexp_mant:
; GCN: s_waitcnt
; GCN-NEXT: v_frexp_mant_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64
define float @v_test_canonicalize_frexp_mant(float %a) {
%cvt = call float @llvm.amdgcn.frexp.mant.f32(float %a)
%canonicalized = call float @llvm.canonicalize.f32(float %cvt)
ret float %canonicalized
}
; Avoid failing the test on FreeBSD11.0 which will match the GCN-NOT: 1.0
; in the .amd_amdgpu_isa "amdgcn-unknown-freebsd11.0--gfx802" directive
; CHECK: .amd_amdgpu_isa
declare float @llvm.canonicalize.f32(float) #0
declare float @llvm.copysign.f32(float, float) #0
declare float @llvm.amdgcn.fmul.legacy(float, float) #0
declare float @llvm.amdgcn.fmad.ftz.f32(float, float, float) #0
declare double @llvm.canonicalize.f64(double) #0
declare half @llvm.canonicalize.f16(half) #0
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.sqrt.f32(float) #0
declare float @llvm.ceil.f32(float) #0
declare float @llvm.floor.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.sin.f32(float) #0
declare float @llvm.cos.f32(float) #0
declare half @llvm.sin.f16(half) #0
declare half @llvm.cos.f16(half) #0
declare float @llvm.minnum.f32(float, float) #0
declare float @llvm.maxnum.f32(float, float) #0
declare double @llvm.maxnum.f64(double, double) #0
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #0
declare float @llvm.amdgcn.cubeid(float, float, float) #0
declare float @llvm.amdgcn.frexp.mant.f32(float) #0
attributes #0 = { nounwind readnone }
attributes #1 = { "no-nans-fp-math"="true" }
attributes #2 = { "target-features"="-fp64-fp16-denormals" }