Summary: Incorrect code was generated when lowering insertelement operations for vectors with 8- or 16-bit elements. The value being inserted was not adjusted for the position of the element within the 32-bit word, so only the low element within each 32-bit word could receive the intended value. Fixed by replicating the value to each element of a congruent vector before the mask-and-or operation used to update the intended element. A number of affected LIT tests have been updated appropriately.

Reviewers: arsenm, nhaehnle

Reviewed By: arsenm

Subscribers: llvm-commits, arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D57588

llvm-svn: 352885
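As a rough illustration of the replicate-then-mask-and-or pattern the summary describes (this sketch is not part of the test file below; the function name, the replication via a 0x00010001 multiply, and the lane handling are illustrative assumptions, not the exact DAG nodes the backend emits):

; Insert a 16-bit element into a 32-bit word: replicate the value across the
; whole word first, then mask and or it into the selected half, so the result
; is correct regardless of which half is being updated.
define i32 @sketch_insert_elt16(i32 %word, i16 %val, i32 %lane) {
  %ext   = zext i16 %val to i32
  %repl  = mul i32 %ext, 65537         ; 0x00010001: replicate value into both halves
  %shamt = shl i32 %lane, 4            ; lane * 16 bits
  %mask  = shl i32 65535, %shamt       ; 0xffff shifted to the target half
  %inv   = xor i32 %mask, -1
  %keep  = and i32 %word, %inv         ; clear the target half of the word
  %ins   = and i32 %repl, %mask        ; take the replicated value for that half
  %res   = or i32 %keep, %ins
  ret i32 %res
}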
; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s
; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=-fp32-denormals,+fp-exceptions < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GCN-EXCEPT,VI,VI-FLUSH,GCN-FLUSH %s
; RUN: llc -march=amdgcn -mcpu=gfx801 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,VI-DENORM,GCN-DENORM,GCN-NOEXCEPT %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=+fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-DENORM,GCN-DENORM,GCN-NOEXCEPT %s
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX9-FLUSH,GCN-FLUSH,GCN-NOEXCEPT %s

; GCN-LABEL: {{^}}test_no_fold_canonicalize_loaded_value_f32:
; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GFX9-DENORM: v_max_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_loaded_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %v = load float, float addrspace(1)* %gep, align 4
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_value_f32:
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fmul_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fmul float %load, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_fmul_legacy_value_f32:
; GCN: v_mul_legacy_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fmul_legacy_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.amdgcn.fmul.legacy(float %load, float 15.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_sub_value_f32:
; GCN: v_sub_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_sub_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fsub float 15.0, %load
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_add_value_f32:
; GCN: v_add_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_add_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fadd float %load, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_sqrt_value_f32:
; GCN: v_sqrt_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_sqrt_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.sqrt.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fceil_value_f32:
; GCN: v_ceil_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fceil_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.ceil.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_floor_value_f32:
; GCN: v_floor_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_floor_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.floor.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fma_value_f32:
; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
; GCN: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fma_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.fma.f32(float %load, float 15.0, float 15.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fmad_ftz_value_f32:
; GCN: s_mov_b32 [[SGPR:s[0-9]+]], 0x41700000
; GCN: v_mad_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SGPR]], [[SGPR]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fmad_ftz_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.amdgcn.fmad.ftz.f32(float %load, float 15.0, float 15.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fmuladd_value_f32:
; GCN-FLUSH: v_mac_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
; GCN-DENORM: s_mov_b32 [[SREG:s[0-9]+]], 0x41700000
; GCN-DENORM: v_fma_f32 [[V:v[0-9]+]], v{{[0-9]+}}, [[SREG]], [[SREG]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
; GCN-NOT: 1.0
define amdgpu_kernel void @test_fold_canonicalize_fmuladd_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.fmuladd.f32(float %load, float 15.0, float 15.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_canonicalize_value_f32:
; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]],
; GCN-FLUSH: v_mul_f32_e32 [[V:v[0-9]+]], 1.0, [[LOAD]]
; GCN-DENORM: v_max_f32_e32 [[V:v[0-9]+]], [[LOAD]], [[LOAD]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_canonicalize_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = call float @llvm.canonicalize.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpextend_value_f64_f32:
; GCN: v_cvt_f64_f32_e32 [[V:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f64_f32(float addrspace(1)* %arg, double addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fpext float %load to double
  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
  %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
  store double %canonicalized, double addrspace(1)* %gep2, align 8
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16:
; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16(half addrspace(1)* %arg, float addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %load = load half, half addrspace(1)* %gep, align 2
  %v = fpext half %load to float
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
  store float %canonicalized, float addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpextend_value_f32_f16_flushf16:
; GCN: v_cvt_f32_f16_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpextend_value_f32_f16_flushf16(half addrspace(1)* %arg, float addrspace(1)* %out) #2 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %load = load half, half addrspace(1)* %gep, align 2
  %v = fpext half %load to float
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
  store float %canonicalized, float addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpround_value_f32_f64:
; GCN: v_cvt_f32_f64_e32 [[V:v[0-9]+]], v[{{[0-9:]+}}]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f32_f64(double addrspace(1)* %arg, float addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
  %load = load double, double addrspace(1)* %gep, align 8
  %v = fptrunc double %load to float
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
  store float %canonicalized, float addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32:
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_max
; GCN-NOT: v_mul
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32(float addrspace(1)* %arg, half addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fptrunc float %load to half
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
  store half %canonicalized, half addrspace(1)* %gep2, align 2
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpround_value_f16_f32_flushf16:
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_max
; GCN-NOT: v_mul
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_f16_f32_flushf16(float addrspace(1)* %arg, half addrspace(1)* %out) #2 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fptrunc float %load to half
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
  store half %canonicalized, half addrspace(1)* %gep2, align 2
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fpround_value_v2f16_v2f32:
; GCN-DAG: v_cvt_f16_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; VI-DAG: v_cvt_f16_f32_sdwa [[V1:v[0-9]+]], v{{[0-9]+}}
; VI: v_or_b32_e32 [[V:v[0-9]+]], [[V0]], [[V1]]
; GFX9: v_cvt_f16_f32_e32 [[V1:v[0-9]+]], v{{[0-9]+}}
; GFX9: v_and_b32_e32 [[V0_16:v[0-9]+]], 0xffff, [[V0]]
; GFX9: v_lshl_or_b32 [[V:v[0-9]+]], [[V1]], 16, [[V0_16]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fpround_value_v2f16_v2f32(<2 x float> addrspace(1)* %arg, <2 x half> addrspace(1)* %out) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds <2 x float>, <2 x float> addrspace(1)* %arg, i32 %id
  %load = load <2 x float>, <2 x float> addrspace(1)* %gep, align 8
  %v = fptrunc <2 x float> %load to <2 x half>
  %canonicalized = tail call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %v)
  %gep2 = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %id
  store <2 x half> %canonicalized, <2 x half> addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: test_no_fold_canonicalize_fneg_value_f32:
; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, -1.0, v{{[0-9]+}}
; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}}
define amdgpu_kernel void @test_no_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = fsub float -0.0, %load
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fneg_value_f32:
; GCN: v_xor_b32_e32 [[V:v[0-9]+]], 0x80000000, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fneg_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = fsub float -0.0, %v0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_no_fold_canonicalize_fabs_value_f32:
; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
define amdgpu_kernel void @test_no_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.fabs.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_no_fold_canonicalize_fcopysign_value_f32:
; GCN-FLUSH: v_mul_f32_e64 v{{[0-9]+}}, 1.0, |v{{[0-9]+}}|
; GCN-DENORM: v_max_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
; GCN-NOT: v_mul_
; GCN-NOT: v_max_
define amdgpu_kernel void @test_no_fold_canonicalize_fcopysign_value_f32(float addrspace(1)* %arg, float %sign) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %canon.load = tail call float @llvm.canonicalize.f32(float %load)
  %copysign = call float @llvm.copysign.f32(float %canon.load, float %sign)
  %v = tail call float @llvm.fabs.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fabs_value_f32:
; GCN: v_and_b32_e32 [[V:v[0-9]+]], 0x7fffffff, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_fabs_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = tail call float @llvm.fabs.f32(float %v0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_sin_value_f32:
; GCN: v_sin_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_sin_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.sin.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_cos_value_f32:
; GCN: v_cos_f32_e32 [[V:v[0-9]+]], v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_cos_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.cos.f32(float %load)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_sin_value_f16:
; GCN: v_sin_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_sin_value_f16(half addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %load = load half, half addrspace(1)* %gep, align 2
  %v = tail call half @llvm.sin.f16(half %load)
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  store half %canonicalized, half addrspace(1)* %gep, align 2
  ret void
}

; GCN-LABEL: test_fold_canonicalize_cos_value_f16:
; GCN: v_cos_f32_e32 [[V0:v[0-9]+]], v{{[0-9]+}}
; GCN: v_cvt_f16_f32_e32 [[V:v[0-9]+]], [[V0]]
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_short v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_cos_value_f16(half addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %load = load half, half addrspace(1)* %gep, align 2
  %v = tail call half @llvm.cos.f16(half %load)
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  store half %canonicalized, half addrspace(1)* %gep, align 2
  ret void
}

; GCN-LABEL: test_fold_canonicalize_qNaN_value_f32:
; GCN: v_mov_b32_e32 [[V:v[0-9]+]], 0x7fc00000
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_qNaN_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %canonicalized = tail call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
; GCN-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
; GCN-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, [[QUIET]]

; GCN-NOT: v_max
; GCN-NOT: v_mul

; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; GCN-DENORM-NOT: v_max
; GCN-DENORM-NOT: v_mul

; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
; GCN-DENORM-NOT: v_max
; GCN-DENORM-NOT: v_mul

; GFX9: {{flat|global}}_store_dword v[{{[0-9:]+}}]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(float addrspace(1)* %arg) #1 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_minnum_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = tail call float @llvm.minnum.f32(float %v0, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; FIXME: Should there be more checks here? minnum with NaN operand is simplified away.

; GCN-LABEL: test_fold_canonicalize_sNaN_value_f32:
; GCN: {{flat|global}}_load_dword [[LOAD:v[0-9]+]]
; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, [[LOAD]]
; GCN-DENORM: v_max_f32_e32 v{{[0-9]+}}, [[LOAD]], [[LOAD]]
define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 2139095041 to float))
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]

; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]]

; GFX9-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]

; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]]
; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]]

; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]]

; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]

; GFX9: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]

; VI-FLUSH: v_mul_f32_e32 [[QUIET:v[0-9]+]], 1.0, [[VAL]]
; VI-FLUSH: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]

; VI-DENORM: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[VAL]]

; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[RESULT]]
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v = tail call float @llvm.maxnum.f32(float %load, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_maxnum_value_f32:
; GCN: v_max_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
; GCN-NOT: v_max
; GCN-NOT: v_mul
; GCN: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load = load float, float addrspace(1)* %gep, align 4
  %v0 = fadd float %load, 0.0
  %v = tail call float @llvm.maxnum.f32(float %v0, float 0.0)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; GCN-LABEL: test_fold_canonicalize_maxnum_value_f64:
; GCN: v_max_f64 [[V:v\[[0-9]+:[0-9]+\]]], v[{{[0-9:]+}}], 0
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
define amdgpu_kernel void @test_fold_canonicalize_maxnum_value_f64(double addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
  %load = load double, double addrspace(1)* %gep, align 8
  %v0 = fadd double %load, 0.0
  %v = tail call double @llvm.maxnum.f64(double %v0, double 0.0)
  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
  store double %canonicalized, double addrspace(1)* %gep, align 8
  ret void
}

; GCN-LABEL: test_fold_canonicalize_fmul_value_f32_no_ieee:
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_fmul_value_f32_no_ieee(float %arg) {
entry:
  %v = fmul float %arg, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  ret float %canonicalized
}

; GCN-LABEL: test_fold_canonicalize_fmul_nnan_value_f32_no_ieee:
; GCN: v_mul_f32_e32 [[V:v[0-9]+]], 0x41700000, v{{[0-9]+}}
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_fmul_nnan_value_f32_no_ieee(float %arg) {
entry:
  %v = fmul nnan float %arg, 15.0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  ret float %canonicalized
}

; GCN-LABEL: {{^}}test_fold_canonicalize_fdiv_value_f32_no_ieee:
; GCN: v_div_fixup_f32
; GCN-NOT: v_max
; GCN-NOT: v_mul
; GCN: ; return
define amdgpu_ps float @test_fold_canonicalize_fdiv_value_f32_no_ieee(float %arg0) {
entry:
  %v = fdiv float 15.0, %arg0
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  ret float %canonicalized
}

; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f32
; GFX9-DENORM: global_load_dword [[V:v[0-9]+]],
; GFX9-DENORM: global_store_dword v[{{[0-9:]+}}], [[V]]
; GFX9-DENORM-NOT: 1.0
; GCN-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f32(float addrspace(1)* %arg, float addrspace(1)* %out) #1 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %v = load float, float addrspace(1)* %gep, align 4
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  %gep2 = getelementptr inbounds float, float addrspace(1)* %out, i32 %id
  store float %canonicalized, float addrspace(1)* %gep2, align 4
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f64
; GCN: {{flat|global}}_load_dwordx2 [[V:v\[[0-9:]+\]]],
; GCN: {{flat|global}}_store_dwordx2 v[{{[0-9:]+}}], [[V]]
; GCN-NOT: v_mul_
; GCN-NOT: v_max_
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(double addrspace(1)* %arg, double addrspace(1)* %out) #1 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds double, double addrspace(1)* %arg, i32 %id
  %v = load double, double addrspace(1)* %gep, align 8
  %canonicalized = tail call double @llvm.canonicalize.f64(double %v)
  %gep2 = getelementptr inbounds double, double addrspace(1)* %out, i32 %id
  store double %canonicalized, double addrspace(1)* %gep2, align 8
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]],
; GCN-NOT: v_mul
; GCN-NOT: v_max
; GCN: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[V]]
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(half addrspace(1)* %arg, half addrspace(1)* %out) #1 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds half, half addrspace(1)* %arg, i32 %id
  %v = load half, half addrspace(1)* %gep, align 2
  %canonicalized = tail call half @llvm.canonicalize.f16(half %v)
  %gep2 = getelementptr inbounds half, half addrspace(1)* %out, i32 %id
  store half %canonicalized, half addrspace(1)* %gep2, align 2
  ret void
}

; GCN-LABEL: {{^}}test_fold_canonicalize_select_value_f32:
; GCN: v_add_f32
; GCN: v_add_f32
; GCN: v_cndmask_b32
; GCN-NOT: v_mul_
; GCN-NOT: v_max_
define amdgpu_kernel void @test_fold_canonicalize_select_value_f32(float addrspace(1)* %arg) {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %id
  %load0 = load volatile float, float addrspace(1)* %gep, align 4
  %load1 = load volatile float, float addrspace(1)* %gep, align 4
  %load2 = load volatile i32, i32 addrspace(1)* undef, align 4
  %v0 = fadd float %load0, 15.0
  %v1 = fadd float %load1, 32.0
  %cond = icmp eq i32 %load2, 0
  %select = select i1 %cond, float %v0, float %v1
  %canonicalized = tail call float @llvm.canonicalize.f32(float %select)
  store float %canonicalized, float addrspace(1)* %gep, align 4
  ret void
}

; Need to quiet the nan with a separate instruction since it will be
; passed through the minnum.
; FIXME: canonicalize doesn't work correctly without ieee_mode

; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode:
; GFX9-NOT: v0
; GFX9-NOT: v1
; GFX9: v_min_f32_e32 v0, v0, v1
; GFX9-NEXT: ; return to shader

; VI-FLUSH: v_min_f32_e32 v0, v0, v1
; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
; VI-FLUSH-NEXT: ; return

; VI-DENORM-NOT: v0
; VI-DENORM: v_min_f32_e32 v0, v0, v1
; VI-DENORM-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode(float %arg0, float %arg1) {
  %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  ret float %canonicalized
}

; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_ieee_mode:
; GFX9: v_min_f32_e32 v0, v0, v1
; GFX9-NEXT: s_setpc_b64

; VI-FLUSH-DAG: v_mul_f32_e32 v0, 1.0, v0
; VI-FLUSH-DAG: v_mul_f32_e32 v1, 1.0, v1
; VI-FLUSH: v_min_f32_e32 v0, v0, v1

; VI-DENORM-DAG: v_max_f32_e32 v0, v0, v0
; VI-DENORM-DAG: v_max_f32_e32 v1, v1, v1
; VI-DENORM: v_min_f32_e32 v0, v0, v1

; VI-NEXT: s_setpc_b64
define float @test_fold_canonicalize_minnum_value_ieee_mode(float %arg0, float %arg1) {
  %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  ret float %canonicalized
}

; Canonicalizing flush necessary pre-gfx9
; GCN-LABEL: {{^}}test_fold_canonicalize_minnum_value_no_ieee_mode_nnan:
; GCN: v_min_f32_e32 v0, v0, v1
; VI-FLUSH-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: ; return
define amdgpu_ps float @test_fold_canonicalize_minnum_value_no_ieee_mode_nnan(float %arg0, float %arg1) #1 {
  %v = tail call float @llvm.minnum.f32(float %arg0, float %arg1)
  %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
  ret float %canonicalized
}

; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_v2f16:
; GFX9-DAG: v_add_f16_e32
; GFX9-DAG: v_mul_f16_e32
; GFX9-NOT: v_max
; GFX9-NOT: v_pk_max
define <2 x half> @v_test_canonicalize_build_vector_v2f16(<2 x half> %vec) {
  %lo = extractelement <2 x half> %vec, i32 0
  %hi = extractelement <2 x half> %vec, i32 1
  %lo.op = fadd half %lo, 1.0
  %hi.op = fmul half %lo, 4.0
  %ins0 = insertelement <2 x half> undef, half %lo.op, i32 0
  %ins1 = insertelement <2 x half> %ins0, half %hi.op, i32 1
  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins1)
  ret <2 x half> %canonicalized
}

; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_noncanon1_v2f16:
; GFX9: v_add_f16_e32
; GFX9: v_pk_max
define <2 x half> @v_test_canonicalize_build_vector_noncanon1_v2f16(<2 x half> %vec) {
  %lo = extractelement <2 x half> %vec, i32 0
  %lo.op = fadd half %lo, 1.0
  %ins = insertelement <2 x half> %vec, half %lo.op, i32 0
  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
  ret <2 x half> %canonicalized
}

; GCN-LABEL: {{^}}v_test_canonicalize_build_vector_noncanon0_v2f16:
; GFX9: v_add_f16_sdwa
; GFX9: v_pk_max
define <2 x half> @v_test_canonicalize_build_vector_noncanon0_v2f16(<2 x half> %vec) {
  %hi = extractelement <2 x half> %vec, i32 1
  %hi.op = fadd half %hi, 1.0
  %ins = insertelement <2 x half> %vec, half %hi.op, i32 1
  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
  ret <2 x half> %canonicalized
}

; GCN-LABEL: {{^}}v_test_canonicalize_extract_element_v2f16:
; GFX9: s_waitcnt
; GFX9-NEXT: v_mul_f16_e32 v0, 4.0, v0
; GFX9-NEXT: s_setpc_b64
define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) {
  %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
  %elt = extractelement <2 x half> %vec.op, i32 0
  %canonicalized = call half @llvm.canonicalize.f16(half %elt)
  ret half %canonicalized
}

; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16:
; GFX9: v_mul_f16_e32
; GFX9: v_pk_mul_f16
; GFX9-NOT: v_max
; GFX9-NOT: v_pk_max
define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
  %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
  %ins.op = fmul half %val, 8.0
  %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx
  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
  ret <2 x half> %canonicalized
}

; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_vec_v2f16:
; GFX9: v_mul_f16
; GFX9: v_pk_max_f16 v0, v0, v0
; GFX9-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_insertelement_noncanon_vec_v2f16(<2 x half> %vec, half %val, i32 %idx) {
  %ins.op = fmul half %val, 8.0
  %ins = insertelement <2 x half> %vec, half %ins.op, i32 %idx
  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
  ret <2 x half> %canonicalized
}

; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_insval_v2f16:
; GFX9: v_pk_mul_f16
; GFX9: v_pk_max_f16 v0, v0, v0
; GFX9-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x half> %vec, half %val, i32 %idx) {
  %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
  %ins = insertelement <2 x half> %vec.op, half %val, i32 %idx
  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
  ret <2 x half> %canonicalized
}

; GCN-LABEL: {{^}}v_test_canonicalize_cvt_pkrtz:
; GCN: s_waitcnt
; GCN-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
; GCN-NEXT: s_setpc_b64
define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) {
  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt)
  ret <2 x half> %canonicalized
}

; GCN-LABEL: {{^}}v_test_canonicalize_cubeid:
; GCN: s_waitcnt
; GCN-NEXT: v_cubeid_f32 v0, v0, v1, v2
; GCN-NEXT: s_setpc_b64
define float @v_test_canonicalize_cubeid(float %a, float %b, float %c) {
  %cvt = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
  %canonicalized = call float @llvm.canonicalize.f32(float %cvt)
  ret float %canonicalized
}

; GCN-LABEL: {{^}}v_test_canonicalize_frexp_mant:
; GCN: s_waitcnt
; GCN-NEXT: v_frexp_mant_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64
define float @v_test_canonicalize_frexp_mant(float %a) {
  %cvt = call float @llvm.amdgcn.frexp.mant.f32(float %a)
  %canonicalized = call float @llvm.canonicalize.f32(float %cvt)
  ret float %canonicalized
}

; Avoid failing the test on FreeBSD11.0 which will match the GCN-NOT: 1.0
; in the .amd_amdgpu_isa "amdgcn-unknown-freebsd11.0--gfx802" directive
; CHECK: .amd_amdgpu_isa

declare float @llvm.canonicalize.f32(float) #0
declare float @llvm.copysign.f32(float, float) #0
declare float @llvm.amdgcn.fmul.legacy(float, float) #0
declare float @llvm.amdgcn.fmad.ftz.f32(float, float, float) #0
declare double @llvm.canonicalize.f64(double) #0
declare half @llvm.canonicalize.f16(half) #0
declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.sqrt.f32(float) #0
declare float @llvm.ceil.f32(float) #0
declare float @llvm.floor.f32(float) #0
declare float @llvm.fma.f32(float, float, float) #0
declare float @llvm.fmuladd.f32(float, float, float) #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.sin.f32(float) #0
declare float @llvm.cos.f32(float) #0
declare half @llvm.sin.f16(half) #0
declare half @llvm.cos.f16(half) #0
declare float @llvm.minnum.f32(float, float) #0
declare float @llvm.maxnum.f32(float, float) #0
declare double @llvm.maxnum.f64(double, double) #0
declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #0
declare float @llvm.amdgcn.cubeid(float, float, float) #0
declare float @llvm.amdgcn.frexp.mant.f32(float) #0

attributes #0 = { nounwind readnone }
attributes #1 = { "no-nans-fp-math"="true" }
attributes #2 = { "target-features"="-fp64-fp16-denormals" }