1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 20:51:52 +01:00
llvm-mirror/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
Matt Arsenault 33f5f7933c AMDGPU: Change pre-gfx9 implementation of fcanonicalize to mul
If f32 denormals were enabled pre-gfx9, we would still try to
implement this with v_max_f32. Pre-gfx9, these instructions ignored
the denormal mode and did not flush. Switch to the multiply form for
f32 as a workaround which should always work in any case.

This fixes conformance failures when the library implementations of
fmin/fmax were accidentally not inlined, forcing the assumption of no
flushing on targets where denormals are not enabled by default. This
is a workaround, since really we should not be mixing code with
different FP mode expectations, but prefer the lowering that will work
in any mode.

Now this will always use max to implement canonicalize on gfx9+. This
is only really beneficial for f64. For f32/f16 it's a neutral choice
(and worse in terms of code size in 1 case), but possibly worse for
the compiler since it does add an extra register use operand. Leave
this change for later.
2020-04-23 15:24:13 -04:00

748 lines
33 KiB
LLVM

; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GFX89 %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
; RUN: llc -march=amdgcn -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
; Intrinsics used by the tests below. All of them carry attribute group #0.
; Scalar half fabs / canonicalize.
declare half @llvm.fabs.f16(half) #0
declare half @llvm.canonicalize.f16(half) #0
; Vector variants for 2, 3 and 4 half elements (fabs is only needed for v2f16).
declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
declare <3 x half> @llvm.canonicalize.v3f16(<3 x half>) #0
declare <4 x half> @llvm.canonicalize.v4f16(<4 x half>) #0
; Workitem id, used by the v2f16 kernels to index into their buffer.
declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN-LABEL: {{^}}test_fold_canonicalize_undef_value_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalize of an undef half; nothing is loaded, the result goes to %out.
define amdgpu_kernel void @test_fold_canonicalize_undef_value_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half undef)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
; GFX89: v_max_f16_e32 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; CI: v_cvt_f32_f16_e32
; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
; Canonicalize a half loaded from %out. Note that the result is stored through
; an undef pointer here, whereas the neighbouring tests store back to %out.
define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
  %loaded = load half, half addrspace(1)* %out
  %canon = call half @llvm.canonicalize.f16(half %loaded)
  store half %canon, half addrspace(1)* undef
  ret void
}
; GCN-LABEL: {{^}}s_test_canonicalize_var_f16:
; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; The input arrives as a zero-extended i16 kernel argument and is reinterpreted
; as half before being canonicalized and stored to %out.
define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
  %as.half = bitcast i16 %val.arg to half
  %canon = call half @llvm.canonicalize.f16(half %as.half)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16:
; GFX9: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
; VI: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_max_f16_e32 v0, v0, v0
; VI: v_or_b32_e32 v0, v0, v1
; Assembles a <2 x half> from the two scalar arguments (%lo in lane 0, %hi in
; lane 1) and canonicalizes the vector as a whole.
define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
  %vec.lo = insertelement <2 x half> undef, half %lo, i32 0
  %vec.full = insertelement <2 x half> %vec.lo, half %hi, i32 1
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec.full)
  ret <2 x half> %canon
}
; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], |{{v[0-9]+}}|, |{{v[0-9]+}}|
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; canonicalize(fabs(x)) on a half loaded from %out; result written back to %out.
define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 {
  %loaded = load half, half addrspace(1)* %out
  %abs = call half @llvm.fabs.f16(half %loaded)
  %canon = call half @llvm.canonicalize.f16(half %abs)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16:
; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -|{{v[0-9]+}}|, -|{{v[0-9]+}}|
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
; canonicalize(-fabs(x)); the negation is spelled as "fsub -0.0, v".
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 {
  %loaded = load half, half addrspace(1)* %out
  %abs = call half @llvm.fabs.f16(half %loaded)
  %neg.abs = fsub half -0.0, %abs
  %canon = call half @llvm.canonicalize.f16(half %neg.abs)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16:
; GFX89: v_max_f16_e64 [[REG:v[0-9]+]], -{{v[0-9]+}}, -{{v[0-9]+}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -{{v[0-9]+}}
; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
; canonicalize(-x) on a half loaded from %out; the negation is "fsub -0.0, v".
define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 {
  %loaded = load half, half addrspace(1)* %out
  %neg = fsub half -0.0, %loaded
  %canon = call half @llvm.canonicalize.f16(half %neg)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}v_test_no_denormals_canonicalize_fneg_var_f16:
; VI: v_mul_f16_e32 [[REG:v[0-9]+]], -1.0, v{{[0-9]+}}
; GFX9: v_max_f16_e64 [[REG:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Same IR as @v_test_canonicalize_fneg_var_f16, but compiled with attribute
; group #2 instead of #1.
define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(half addrspace(1)* %out) #2 {
  %loaded = load half, half addrspace(1)* %out
  %neg = fsub half -0.0, %loaded
  %canon = call half @llvm.canonicalize.f16(half %neg)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}v_test_no_denormals_canonicalize_fneg_fabs_var_f16:
; VI: v_mul_f16_e64 [[REG:v[0-9]+]], -1.0, |v{{[0-9]+}}|
; GFX9: v_max_f16_e64 [[REG:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}|
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; CI: v_cvt_f32_f16_e64 {{v[0-9]+}}, -|{{v[0-9]+}}|
; CI: v_mul_f32_e32 {{v[0-9]+}}, 1.0, {{v[0-9]+}}
; Same IR as @v_test_canonicalize_fneg_fabs_var_f16, but compiled with
; attribute group #2 instead of #1.
define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #2 {
  %loaded = load half, half addrspace(1)* %out
  %abs = call half @llvm.fabs.f16(half %loaded)
  %neg.abs = fsub half -0.0, %abs
  %canon = call half @llvm.canonicalize.f16(half %neg.abs)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input: +0.0.
define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half 0.0)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input: -0.0.
define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half -0.0)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input: +1.0.
define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half 1.0)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input: -1.0.
define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half -1.0)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input: 16.0.
define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half 16.0)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input with bit pattern 0x03FF (zero exponent field, all mantissa
; bits set, sign clear: a positive denormal). Uses attribute group #1.
define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half 0xH03FF)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Same positive denormal constant (0x03FF) as the default_denormals variant,
; but with attribute group #3.
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
  %canon = call half @llvm.canonicalize.f16(half 0xH03FF)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal1_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input with bit pattern 0x83FF: the negative counterpart of the
; 0x03FF denormal. Uses attribute group #1.
define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half 0xH83FF)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Same negative denormal constant (0x83FF) as the default_denormals variant,
; but with attribute group #3.
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
  %canon = call half @llvm.canonicalize.f16(half 0xH83FF)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input with bit pattern 0x7C00 (exponent all ones, mantissa zero).
; NOTE(review): that encoding is +infinity rather than a quiet NaN, despite the
; test name; the expected output above is the same 0x7c00 pattern.
define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half 0xH7C00)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input built from i16 -1, i.e. all 16 bits set (a NaN encoding with
; the sign bit and every mantissa bit set).
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half))
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input built from i16 -2, i.e. bit pattern 0xFFFE (a NaN encoding
; that differs from the neg1 test only in the lowest mantissa bit).
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half))
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input 0x7C01: exponent all ones, top mantissa bit clear, lowest
; mantissa bit set (a positive signaling NaN encoding).
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half 0xH7C01)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input 0x7DFF: exponent all ones, top mantissa bit clear, all lower
; mantissa bits set (a positive signaling NaN encoding).
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half 0xH7DFF)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input 0xFDFF: the snan1 pattern (0x7DFF) with the sign bit set.
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half 0xHFDFF)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant input 0xFC01: the snan0 pattern (0x7C01) with the sign bit set.
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
  %canon = call half @llvm.canonicalize.f16(half 0xHFC01)
  store half %canon, half addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16:
; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[REG1:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
; VI-NOT: v_and_b32
; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+$}}
; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Loads the <2 x half> element of %out selected by the workitem id,
; canonicalizes it, and stores the result to the base of %out.
define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %elt.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %wi
  %loaded = load <2 x half>, <2 x half> addrspace(1)* %elt.ptr
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %loaded)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
; VI: v_max_f16_sdwa [[REG0:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_max_f16_e64 [[REG1:v[0-9]+]], |v{{[0-9]+}}|, |v{{[0-9]+}}|
; VI-NOT: 0xffff
; VI: v_or_b32
; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], [[ABS]], [[ABS]]{{$}}
; GFX89: {{flat|global}}_store_dword
; Vector form of canonicalize(fabs(x)): the input is the workitem-indexed
; element of %out, the result is stored to the base of %out.
define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %elt.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %wi
  %loaded = load <2 x half>, <2 x half> addrspace(1)* %elt.ptr
  %abs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %loaded)
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %abs)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}| dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e64 [[REG1:v[0-9]+]], -|v{{[0-9]+}}|, -|v{{[0-9]+}}|
; VI: v_or_b32
; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], [[ABS]], [[ABS]] neg_lo:[1,1] neg_hi:[1,1]{{$}}
; GFX89: {{flat|global}}_store_dword
; CI: v_cvt_f32_f16
; CI: v_cvt_f32_f16
; CI: v_mul_f32_e32 v{{[0-9]+}}, 1.0
; CI: v_mul_f32_e32 v{{[0-9]+}}, 1.0
; Vector form of canonicalize(-fabs(x)); the negation is an fsub from a
; <-0.0, -0.0> splat.
define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %elt.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %wi
  %loaded = load <2 x half>, <2 x half> addrspace(1)* %elt.ptr
  %abs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %loaded)
  %neg.abs = fsub <2 x half> <half -0.0, half -0.0>, %abs
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %neg.abs)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
; VI-DAG: v_max_f16_sdwa [[REG1:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e64 [[REG0:v[0-9]+]], -v{{[0-9]+}}, -v{{[0-9]+}}
; VI-NOT: 0xffff
; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} neg_lo:[1,1] neg_hi:[1,1]{{$}}
; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Vector form of canonicalize(-x); the negation is an fsub from a
; <-0.0, -0.0> splat.
define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %wi = call i32 @llvm.amdgcn.workitem.id.x()
  %elt.ptr = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %wi
  %loaded = load <2 x half>, <2 x half> addrspace(1)* %elt.ptr
  %neg = fsub <2 x half> <half -0.0, half -0.0>, %loaded
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %neg)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16:
; VI-DAG: v_max_f16_sdwa [[REG0:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-DAG: v_max_f16_e64 [[REG1:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+}}
; VI-NOT: v_and_b32
; GFX9: v_pk_max_f16 [[REG:v[0-9]+]], {{s[0-9]+}}, {{s[0-9]+$}}
; GFX9: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; The input arrives as an i32 kernel argument and is reinterpreted as
; <2 x half> before being canonicalized and stored to %out.
define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 {
  %as.v2f16 = bitcast i32 %val.arg to <2 x half>
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %as.v2f16)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: zeroinitializer (+0.0 in both lanes).
define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: -0.0 in both lanes.
define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: +1.0 in both lanes.
define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: -1.0 in both lanes.
define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: 16.0 in both lanes.
define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: the positive denormal 0x03FF in both lanes.
; NOTE(review): this uses attribute group #1, the same group the scalar
; "default_denormals" tests use, even though the name says "no_denormals".
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: the positive denormal 0x03FF in both lanes, with
; attribute group #3.
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: the negative denormal 0x83FF in both lanes.
; NOTE(review): this uses attribute group #1, the same group the scalar
; "default_denormals" tests use, even though the name says "no_denormals".
define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: the negative denormal 0x83FF in both lanes, with
; attribute group #3.
define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: bit pattern 0x7C00 in both lanes.
; NOTE(review): 0x7C00 (exponent all ones, mantissa zero) encodes +infinity
; rather than a quiet NaN, despite the test name.
define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input built from i32 -1: all 32 bits set, so each half lane
; holds 0xFFFF.
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: each lane is i16 -2 (0xFFFE) reinterpreted as half.
define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: the signaling NaN encoding 0x7C01 in both lanes.
define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: the signaling NaN encoding 0x7DFF in both lanes.
define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: 0xFDFF (0x7DFF with the sign bit set) in both lanes.
define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16:
; GFX89: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Constant vector input: 0xFC01 (0x7C01 with the sign bit set) in both lanes.
define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; FIXME: The extra 4th (padding) component of the v3f16 value is processed as well.
; GCN-LABEL: {{^}}v_test_canonicalize_var_v3f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: s_setpc_b64
; VI-DAG: v_max_f16_sdwa [[CANON_ELT1:v[0-9]+]], v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON_ELT0:v[0-9]+]], v0, v0
; VI-DAG: v_max_f16_e32 v1, v1, v1
; VI-DAG: v_or_b32_e32 v0, [[CANON_ELT0]], [[CANON_ELT1]]
; VI: s_setpc_b64
; Canonicalize a <3 x half> passed and returned by value (non-kernel function).
define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 {
  %canon = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> %val)
  ret <3 x half> %canon
}
; GCN-LABEL: {{^}}v_test_canonicalize_var_v4f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: s_setpc_b64
; VI-DAG: v_max_f16_sdwa [[CANON_ELT3:v[0-9]+]], v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON_ELT2:v[0-9]+]], v1, v1
; VI-DAG: v_max_f16_sdwa [[CANON_ELT1:v[0-9]+]], v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-DAG: v_max_f16_e32 [[CANON_ELT0:v[0-9]+]], v0, v0
; VI-DAG: v_or_b32_e32 v0, [[CANON_ELT0]], [[CANON_ELT1]]
; VI-DAG: v_or_b32_e32 v1, [[CANON_ELT2]], [[CANON_ELT3]]
; VI: s_setpc_b64
; Canonicalize a <4 x half> passed and returned by value (non-kernel function).
define <4 x half> @v_test_canonicalize_var_v4f16(<4 x half> %val) #1 {
  %canon = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %val)
  ret <4 x half> %canon
}
; GCN-LABEL: {{^}}s_test_canonicalize_undef_v2f16:
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00
; GFX89: {{flat|global}}_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[REG]]
; Canonicalize of a fully undef <2 x half>; the result is stored to %out.
define amdgpu_kernel void @s_test_canonicalize_undef_v2f16(<2 x half> addrspace(1)* %out) #1 {
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> undef)
  store <2 x half> %canon, <2 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_v2f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_setpc_b64
; High bits known zero
; FIXME: Should also be true on gfx9 by default?
; VI: s_waitcnt
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: s_setpc_b64
; Lane 0 holds the argument, lane 1 is left undef.
define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
  %partial = insertelement <2 x half> undef, half %val, i32 0
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %partial)
  ret <2 x half> %canon
}
; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16:
; GFX89: s_waitcnt
; GFX89-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; GFX89-NEXT: s_setpc_b64
; Lane 1 holds the argument, lane 0 is left undef.
define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
  %partial = insertelement <2 x half> undef, half %val, i32 1
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %partial)
  ret <2 x half> %canon
}
; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_imm_hi_v2f16:
; GCN: s_waitcnt
; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
; GFX89-NEXT: s_setpc_b64
; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; CI-NEXT: v_mov_b32_e32 v1, 1.0
; CI-NEXT: s_setpc_b64
; Lane 1 is the constant 1.0, lane 0 is left undef; no runtime inputs.
define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
  %partial = insertelement <2 x half> undef, half 1.0, i32 1
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %partial)
  ret <2 x half> %canon
}
; GCN-LABEL: {{^}}v_test_canonicalize_imm_lo_undef_hi_v2f16:
; GCN: s_waitcnt
; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
; GFX89-NEXT: s_setpc_b64
; CI-NEXT: v_mov_b32_e32 v0, 1.0
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; CI-NEXT: s_setpc_b64
; Lane 0 is the constant 1.0, lane 1 is left undef; no runtime inputs.
define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
  %partial = insertelement <2 x half> undef, half 1.0, i32 0
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %partial)
  ret <2 x half> %canon
}
; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_k_hi_v2f16:
; GCN: s_waitcnt
; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
; GFX89-NEXT: s_setpc_b64
; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; CI-NEXT: v_mov_b32_e32 v1, 0x41800000
; CI-NEXT: s_setpc_b64
; Lane 1 is the constant 16.0, lane 0 is left undef; no runtime inputs.
define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
  %partial = insertelement <2 x half> undef, half 16.0, i32 1
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %partial)
  ret <2 x half> %canon
}
; GCN-LABEL: {{^}}v_test_canonicalize_k_lo_undef_hi_v2f16:
; GCN: s_waitcnt
; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
; GFX89-NEXT: s_setpc_b64
; CI-NEXT: v_mov_b32_e32 v0, 0x41800000
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; CI-NEXT: s_setpc_b64
; Lane 0 is the constant 16.0, lane 1 is left undef; no runtime inputs.
define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
  %partial = insertelement <2 x half> undef, half 16.0, i32 0
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %partial)
  ret <2 x half> %canon
}
; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
; GFX9: s_waitcnt
; GFX9-DAG: v_max_f16_e32 v0, v0, v0
; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4000
; GFX9: v_and_b32_e32 v0, 0xffff, v0
; GFX9: v_lshl_or_b32 v0, [[K]], 16, v0
; GFX9: s_setpc_b64
; VI: s_waitcnt
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_or_b32_e32 v0, 2.0, v0
; VI-NEXT: s_setpc_b64
; Lane 0 holds the argument, lane 1 holds the constant 2.0.
define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
  %with.reg = insertelement <2 x half> undef, half %val, i32 0
  %with.k = insertelement <2 x half> %with.reg, half 2.0, i32 1
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %with.k)
  ret <2 x half> %canon
}
; GCN-LABEL: {{^}}v_test_canonicalize_k_reg_v2f16:
; GFX9: v_max_f16_e32 v0, v0, v0
; GFX9: v_mov_b32_e32 [[K:v[0-9]+]], 0x4000
; GFX9: v_lshl_or_b32 v0, v0, 16, [[K]]
; GFX9: s_setpc_b64
; VI: s_waitcnt
; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, 0x4000, v0
; VI-NEXT: s_setpc_b64
; Lane 0 holds the constant 2.0, lane 1 holds the argument.
define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
  %with.k = insertelement <2 x half> undef, half 2.0, i32 0
  %with.reg = insertelement <2 x half> %with.k, half %val, i32 1
  %canon = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %with.reg)
  ret <2 x half> %canon
}
; GCN-LABEL: {{^}}s_test_canonicalize_undef_v4f16:
; GCN: v_mov_b32_e32 v0, 0x7e007e00
; GCN: v_mov_b32_e32 v1, v0
; Canonicalize of a fully undef <4 x half>; the result is stored to %out.
define amdgpu_kernel void @s_test_canonicalize_undef_v4f16(<4 x half> addrspace(1)* %out) #1 {
  %canon = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> undef)
  store <4 x half> %canon, <4 x half> addrspace(1)* %out
  ret void
}
; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_undef_undef_v4f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_setpc_b64
; VI: s_waitcnt
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; VI-NEXT: s_setpc_b64
; Only lane 0 of the <4 x half> is defined (from the argument); lanes 1-3 stay
; undef.
define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1 {
  %partial = insertelement <4 x half> undef, half %val, i32 0
  %canon = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %partial)
  ret <4 x half> %canon
}
; GCN-LABEL: {{^}}v_test_canonicalize_reg_reg_undef_undef_v4f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: v_lshl_or_b32 v0, v1, 16, v0
; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_setpc_b64
; VI: s_waitcnt
; VI-DAG: v_max_f16_e32 v0, v0, v0
; VI-DAG: v_max_f16_sdwa v1, v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI: v_or_b32_e32 v0, v0, v1
; VI-NEXT: v_mov_b32_e32 v1, 0x7e007e00
; VI-NEXT: s_setpc_b64
; Lanes 0 and 1 come from the two arguments; lanes 2 and 3 stay undef.
define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, half %val1) #1 {
  %lane0.set = insertelement <4 x half> undef, half %val0, i32 0
  %lane01.set = insertelement <4 x half> %lane0.set, half %val1, i32 1
  %canon = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %lane01.set)
  ret <4 x half> %canon
}
; GCN-LABEL: {{^}}v_test_canonicalize_reg_undef_reg_reg_v4f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff
; GFX9-NEXT: v_and_b32_e32 v1, [[MASK]], v1
; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
; GFX9-NEXT: v_lshl_or_b32 v1, v2, 16, v1
; GFX9-NEXT: v_and_b32_e32 v0, [[MASK]], v0
; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
; GFX9-NEXT: s_setpc_b64
; VI: s_waitcnt
; VI-NEXT: v_max_f16_e32 v0, v0, v0
; VI-NEXT: v_max_f16_e32 v1, v1, v1
; VI-NEXT: v_max_f16_sdwa v2, v2, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v0, 0x7e000000, v0
; VI-NEXT: v_or_b32_e32 v1, v1, v2
; VI-NEXT: s_setpc_b64
; Lanes 0, 2 and 3 come from %val0, %val1 and %val2 respectively; lane 1 is
; never written and stays undef.
define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half %val1, half %val2) #1 {
  %lane0.set = insertelement <4 x half> undef, half %val0, i32 0
  %lane02.set = insertelement <4 x half> %lane0.set, half %val1, i32 2
  %lane023.set = insertelement <4 x half> %lane02.set, half %val2, i32 3
  %canon = call <4 x half> @llvm.canonicalize.v4f16(<4 x half> %lane023.set)
  ret <4 x half> %canon
}
; Attribute groups referenced by the declarations and definitions above.
; #0: used by the intrinsic declarations.
attributes #0 = { nounwind readnone }
; #1: used by most tests; sets only the f32-specific denormal mode key.
attributes #1 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
; #2: used by the v_test_no_denormals_* tests; sets the type-agnostic
; "denormal-fp-math" key rather than the f32-specific one.
attributes #2 = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
; #3: used by the test_denormals_* tests.
; NOTE(review): #3 is textually identical to #1 in this file, so the
; "denormals" and "default_denormals" variants are compiled with the same
; attributes - confirm this is intended.
attributes #3 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }