mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-10-19 11:02:59 +02:00
AMDGPU: Remove unnecessary ands when f16 is legal
Add a new node to act as a fancy bitcast from f16 operations to i32 that implicitly zero the high 16-bits of the result. Alternatively could try making v2f16 legal and canonicalizing on build_vectors. llvm-svn: 299246
This commit is contained in:
parent
3a1dc32be9
commit
c3c5eef5bb
@ -3491,6 +3491,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
NODE_NAME_CASE(CVT_F32_UBYTE3)
|
||||
NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
|
||||
NODE_NAME_CASE(FP_TO_FP16)
|
||||
NODE_NAME_CASE(FP16_ZEXT)
|
||||
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
|
||||
NODE_NAME_CASE(CONST_DATA_PTR)
|
||||
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
|
||||
@ -3587,7 +3588,10 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
|
||||
|
||||
break;
|
||||
}
|
||||
case AMDGPUISD::FP_TO_FP16: {
|
||||
case AMDGPUISD::FP_TO_FP16:
|
||||
case AMDGPUISD::FP16_ZEXT: {
|
||||
unsigned BitWidth = KnownZero.getBitWidth();
|
||||
|
||||
// High bits are zero.
|
||||
KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
|
||||
break;
|
||||
@ -3621,7 +3625,9 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
|
||||
case AMDGPUISD::CARRY:
|
||||
case AMDGPUISD::BORROW:
|
||||
return 31;
|
||||
|
||||
case AMDGPUISD::FP_TO_FP16:
|
||||
case AMDGPUISD::FP16_ZEXT:
|
||||
return 16;
|
||||
default:
|
||||
return 1;
|
||||
}
|
||||
|
@ -353,6 +353,9 @@ enum NodeType : unsigned {
|
||||
// are known 0.
|
||||
FP_TO_FP16,
|
||||
|
||||
// Wrapper around fp16 results that are known to zero the high bits.
|
||||
FP16_ZEXT,
|
||||
|
||||
/// This node is for VLIW targets and it is used to represent a vector
|
||||
/// that is stored in consecutive registers with the same channel.
|
||||
/// For example:
|
||||
|
@ -112,6 +112,7 @@ def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
|
||||
|
||||
def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
|
||||
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
|
||||
def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
|
||||
|
||||
|
||||
def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
|
||||
|
@ -475,6 +475,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
||||
setTargetDAGCombine(ISD::UINT_TO_FP);
|
||||
setTargetDAGCombine(ISD::FCANONICALIZE);
|
||||
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
|
||||
setTargetDAGCombine(ISD::ZERO_EXTEND);
|
||||
|
||||
// All memory operations. Some folding on the pointer operand is done to help
|
||||
// matching the constant offsets in the addressing modes.
|
||||
@ -4058,6 +4059,42 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
static bool fp16SrcZerosHighBits(unsigned Opc) {
|
||||
switch (Opc) {
|
||||
case ISD::SELECT:
|
||||
case ISD::EXTRACT_VECTOR_ELT:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
|
||||
DAGCombinerInfo &DCI) const {
|
||||
if (!Subtarget->has16BitInsts() ||
|
||||
DCI.getDAGCombineLevel() < AfterLegalizeDAG)
|
||||
return SDValue();
|
||||
|
||||
EVT VT = N->getValueType(0);
|
||||
if (VT != MVT::i32)
|
||||
return SDValue();
|
||||
|
||||
SDValue Src = N->getOperand(0);
|
||||
if (Src.getValueType() != MVT::i16)
|
||||
return SDValue();
|
||||
|
||||
// (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
|
||||
// FIXME: It is not universally true that the high bits are zeroed on gfx9.
|
||||
if (Src.getOpcode() == ISD::BITCAST) {
|
||||
SDValue BCSrc = Src.getOperand(0);
|
||||
if (BCSrc.getValueType() == MVT::f16 &&
|
||||
fp16SrcZerosHighBits(BCSrc.getOpcode()))
|
||||
return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
}
|
||||
|
||||
SDValue SITargetLowering::performClassCombine(SDNode *N,
|
||||
DAGCombinerInfo &DCI) const {
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
@ -4594,6 +4631,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
|
||||
return performOrCombine(N, DCI);
|
||||
case ISD::XOR:
|
||||
return performXorCombine(N, DCI);
|
||||
case ISD::ZERO_EXTEND:
|
||||
return performZeroExtendCombine(N, DCI);
|
||||
case AMDGPUISD::FP_CLASS:
|
||||
return performClassCombine(N, DCI);
|
||||
case ISD::FCANONICALIZE:
|
||||
|
@ -81,6 +81,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
|
||||
SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
|
||||
|
@ -1076,6 +1076,11 @@ def : Pat <
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Miscellaneous Patterns
|
||||
//===----------------------------------------------------------------------===//
|
||||
def : Pat <
|
||||
(i32 (AMDGPUfp16_zext f16:$src)),
|
||||
(COPY $src)
|
||||
>;
|
||||
|
||||
|
||||
def : Pat <
|
||||
(i32 (trunc i64:$a)),
|
||||
|
@ -40,9 +40,8 @@ define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
|
||||
; VI: flat_load_ushort [[HI:v[0-9]+]]
|
||||
; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
|
||||
; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[LO]]
|
||||
; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]]
|
||||
; VI-DAG: v_and_b32_e32 [[FABS_HI:v[0-9]+]], [[MASK]], [[HI]]
|
||||
; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
|
||||
; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
|
||||
; VI: v_or_b32
|
||||
; VI: flat_store_dword
|
||||
|
||||
|
@ -77,9 +77,9 @@ entry:
|
||||
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
@ -105,9 +105,9 @@ entry:
|
||||
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
|
||||
; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
|
||||
; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
@ -130,9 +130,9 @@ entry:
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 2.0, v[[A_V2_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 1.0, v[[A_F16_1]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 2.0, v[[A_V2_F16]]
|
||||
; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 1.0, v[[A_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -206,8 +206,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16:
|
||||
; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, {{v[0-9]+}}
|
||||
; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}}
|
||||
; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
|
||||
; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}}
|
||||
; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
|
||||
; VI-NOT: v_and_b32
|
||||
|
||||
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
|
||||
; GFX9: buffer_store_dword [[REG]]
|
||||
@ -220,10 +221,11 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)
|
||||
|
||||
; FIXME: Fold modifier
|
||||
; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
|
||||
; VI: v_bfe_u32
|
||||
; VI: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
|
||||
; VI-DAG: v_bfe_u32
|
||||
; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
|
||||
; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
|
||||
; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
|
||||
; VI-NOT: 0xffff
|
||||
; VI: v_or_b32
|
||||
|
||||
; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
|
||||
@ -258,10 +260,11 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> ad
|
||||
; FIXME: Fold modifier
|
||||
; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
|
||||
; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
|
||||
; VI: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]]
|
||||
; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]]
|
||||
; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]]
|
||||
; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
|
||||
; VI-DAG: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]]
|
||||
; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]]
|
||||
; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]]
|
||||
; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
|
||||
; VI-NOT: 0xffff
|
||||
|
||||
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
|
||||
; GFX9: buffer_store_dword [[REG]]
|
||||
@ -275,8 +278,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspa
|
||||
|
||||
; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16:
|
||||
; VI: v_mul_f16_e64 [[REG0:v[0-9]+]], 1.0, {{s[0-9]+}}
|
||||
; VI: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
|
||||
; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
|
||||
; VI-DAG: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
|
||||
; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
|
||||
; VI-NOT: v_and_b32
|
||||
|
||||
; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}}
|
||||
; GFX9: buffer_store_dword [[REG]]
|
||||
|
@ -77,9 +77,9 @@ entry:
|
||||
|
||||
; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
@ -104,9 +104,9 @@ entry:
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||
; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
@ -129,9 +129,9 @@ entry:
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-DAG: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
||||
; VI-DAG: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -123,7 +123,7 @@ entry:
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
|
||||
half addrspace(1)* %r,
|
||||
float addrspace(1)* %a) {
|
||||
float addrspace(1)* %a) #0 {
|
||||
entry:
|
||||
%a.val = load float, float addrspace(1)* %a
|
||||
%a.fabs = call float @llvm.fabs.f32(float %a.val)
|
||||
@ -133,6 +133,59 @@ entry:
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fptrunc_f32_to_f16_zext_i32:
|
||||
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
|
||||
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
|
||||
; GCN-NOT: v[[R_F16]]
|
||||
; GCN: buffer_store_dword v[[R_F16]]
|
||||
define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
|
||||
i32 addrspace(1)* %r,
|
||||
float addrspace(1)* %a) #0 {
|
||||
entry:
|
||||
%a.val = load float, float addrspace(1)* %a
|
||||
%r.val = fptrunc float %a.val to half
|
||||
%r.i16 = bitcast half %r.val to i16
|
||||
%zext = zext i16 %r.i16 to i32
|
||||
store i32 %zext, i32 addrspace(1)* %r
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fptrunc_fabs_f32_to_f16_zext_i32:
|
||||
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
|
||||
; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
|
||||
; GCN-NOT: v[[R_F16]]
|
||||
; GCN: buffer_store_dword v[[R_F16]]
|
||||
define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
|
||||
i32 addrspace(1)* %r,
|
||||
float addrspace(1)* %a) #0 {
|
||||
entry:
|
||||
%a.val = load float, float addrspace(1)* %a
|
||||
%a.fabs = call float @llvm.fabs.f32(float %a.val)
|
||||
%r.val = fptrunc float %a.fabs to half
|
||||
%r.i16 = bitcast half %r.val to i16
|
||||
%zext = zext i16 %r.i16 to i32
|
||||
store i32 %zext, i32 addrspace(1)* %r
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fptrunc_f32_to_f16_sext_i32:
|
||||
; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
|
||||
; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
|
||||
; GCN: v_bfe_i32 v[[R_F16_SEXT:[0-9]+]], v[[R_F16]], 0, 16
|
||||
; GCN: buffer_store_dword v[[R_F16_SEXT]]
|
||||
define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
|
||||
i32 addrspace(1)* %r,
|
||||
float addrspace(1)* %a) #0 {
|
||||
entry:
|
||||
%a.val = load float, float addrspace(1)* %a
|
||||
%r.val = fptrunc float %a.val to half
|
||||
%r.i16 = bitcast half %r.val to i16
|
||||
%zext = sext i16 %r.i16 to i32
|
||||
store i32 %zext, i32 addrspace(1)* %r
|
||||
ret void
|
||||
}
|
||||
|
||||
declare float @llvm.fabs.f32(float) #1
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
||||
|
@ -80,9 +80,9 @@ entry:
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
|
||||
; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI: v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
|
||||
; VI-DAG: v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
|
||||
|
||||
; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
|
||||
@ -112,10 +112,10 @@ entry:
|
||||
; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
|
||||
; VI: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
|
||||
; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
|
||||
; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
|
||||
; VI-DAG: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
|
||||
|
||||
; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00
|
||||
@ -143,10 +143,10 @@ entry:
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
|
||||
; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
|
||||
; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
|
||||
|
||||
; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000
|
||||
|
@ -31,9 +31,10 @@ entry:
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI: v_ceil_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-DAG: v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_ceil_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -33,9 +33,9 @@ entry:
|
||||
; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
|
||||
|
||||
|
||||
; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||
; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
|
||||
; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
|
||||
|
||||
@ -44,10 +44,9 @@ entry:
|
||||
; GCN-DAG: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
|
||||
; GCN-DAG: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
|
||||
|
||||
; FIXME: Remove and for VI+
|
||||
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_and_b32_e32 v[[R_F16_0]], 0xffff, v[[R_F16_0]]
|
||||
; GCN-NOT: and
|
||||
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
@ -31,9 +31,10 @@ entry:
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_exp_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI: v_exp_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_exp_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -31,9 +31,10 @@ entry:
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI: v_floor_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_floor_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -123,10 +123,11 @@ define amdgpu_kernel void @fma_f16_imm_c(
|
||||
; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||
; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
|
||||
; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
|
||||
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
@ -144,12 +145,17 @@ define amdgpu_kernel void @fma_v2f16(
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_v2f16_imm_a:
|
||||
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||
|
||||
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||
|
||||
; SI: v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
|
||||
; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||
|
||||
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
|
||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||
@ -159,9 +165,12 @@ define amdgpu_kernel void @fma_v2f16(
|
||||
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
|
||||
; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_F16]], v[[C_V2_F16]]
|
||||
; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16]], v[[C_F16_1]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_F16]], v[[C_V2_F16]]
|
||||
|
||||
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
@ -177,8 +186,12 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_v2f16_imm_b:
|
||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||
|
||||
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||
|
||||
; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
|
||||
|
||||
@ -194,12 +207,14 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
|
||||
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]]
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
|
||||
; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||
; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]]
|
||||
; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]]
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||
; VI_DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]]
|
||||
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
@ -215,8 +230,12 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}fma_v2f16_imm_c:
|
||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||
|
||||
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||
|
||||
; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
|
||||
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
|
||||
|
||||
@ -232,12 +251,14 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
|
||||
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
|
||||
; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; VI: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
|
||||
; VI: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
|
||||
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
|
||||
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -115,24 +115,25 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
|
||||
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
||||
; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||
|
||||
|
||||
; FIXME: and should be unnecessary
|
||||
; VI-FLUSH: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI-FLUSH: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]]
|
||||
; VI-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[C_V2_F16]]
|
||||
; VI-FLUSH: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
|
||||
; VI-FLUSH: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI-FLUSH-NOT: v_and_b32
|
||||
; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[C_V2_F16]]
|
||||
|
||||
; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
|
||||
; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
|
||||
; VI-DENORM: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[RES0]]
|
||||
; VI-DENORM: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
|
||||
; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
|
||||
; VI-DENORM-NOT: v_and_b32
|
||||
; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[RES0]]
|
||||
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @fmuladd_v2f16(
|
||||
|
@ -31,9 +31,10 @@ entry:
|
||||
; SI-DAG: v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI: v_log_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-DAG: v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_log_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -80,10 +80,11 @@ entry:
|
||||
|
||||
; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
|
||||
; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
|
||||
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
@ -108,10 +109,11 @@ entry:
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||
; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
|
||||
; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||
; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
|
||||
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
@ -134,10 +136,11 @@ entry:
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
||||
; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
|
||||
; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
||||
; VI-DAG: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
|
||||
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -79,10 +79,11 @@ entry:
|
||||
|
||||
; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
|
||||
; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
|
||||
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
@ -110,10 +111,11 @@ entry:
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
|
||||
; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||
; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
|
||||
; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||
; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
|
||||
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
@ -136,10 +138,11 @@ entry:
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
||||
; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
|
||||
; VI_DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
||||
; VI-DAG: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
|
||||
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: and
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -33,12 +33,11 @@ entry:
|
||||
; SI: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
|
||||
; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
|
||||
; VI: v_and_b32_e32 v[[R_F16_0]], 0xffff, v[[R_F16_0]]
|
||||
; SIVI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; SIVI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-NOT: v_and_b32
|
||||
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
|
||||
; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
|
@ -30,10 +30,9 @@ entry:
|
||||
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||
|
||||
|
||||
; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||
|
||||
|
||||
; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
|
||||
|
@ -31,9 +31,10 @@ entry:
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_sqrt_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI: v_sqrt_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_sqrt_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: v_and_b32
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -31,9 +31,10 @@ entry:
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI: v_trunc_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
|
||||
; VI-DAG: v_trunc_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
|
||||
; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN-NOT: v_and_b32
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
|
||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -162,10 +162,9 @@ entry:
|
||||
; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
|
||||
; NOSDWA-NOT: v_mul_f16_sdwa
|
||||
|
||||
; SDWA: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
|
||||
; SDWA-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]]
|
||||
define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
|
||||
entry:
|
||||
%a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
|
||||
@ -185,8 +184,8 @@ entry:
|
||||
|
||||
; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
|
||||
entry:
|
||||
@ -209,10 +208,10 @@ entry:
|
||||
; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
|
||||
; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; SDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
|
||||
entry:
|
||||
|
@ -317,15 +317,20 @@ entry:
|
||||
; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
|
||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
|
||||
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; VI-NOT: and
|
||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
||||
; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||
; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||
; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||
; VI: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]]
|
||||
; VI: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[C_V2_F16]]
|
||||
; VI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
||||
; VI-DAG: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]]
|
||||
; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_1:[0-9]+]], 16, v[[C_F16_1]]
|
||||
; VI-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||
; VI-NOT: and
|
||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[C_V2_F16]]
|
||||
|
||||
|
||||
; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @mac_v2f16(
|
||||
@ -352,8 +357,8 @@ entry:
|
||||
; SI: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]]
|
||||
; VI: v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]]
|
||||
; VI: v_mac_f16_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI: v_mac_f16_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI-DAG: v_mac_f16_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; VI-DAG: v_mac_f16_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GCN: s_endpgm
|
||||
define amdgpu_kernel void @mac_v2f16_same_add(
|
||||
<2 x half> addrspace(1)* %r0,
|
||||
|
Loading…
Reference in New Issue
Block a user