mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
AMDGPU: Add intrinsics llvm.amdgcn.cvt.{pknorm.i16, pknorm.u16, pk.i16, pk.u16}
Reviewers: arsenm, nhaehnle Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, t-tye Differential Revision: https://reviews.llvm.org/D41663 llvm-svn: 323908
This commit is contained in:
parent
2608200700
commit
1325dff7c0
@ -238,6 +238,26 @@ def int_amdgcn_cvt_pkrtz : Intrinsic<
|
||||
[IntrNoMem, IntrSpeculatable]
|
||||
>;
|
||||
|
||||
def int_amdgcn_cvt_pknorm_i16 : Intrinsic<
|
||||
[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, IntrSpeculatable]
|
||||
>;
|
||||
|
||||
def int_amdgcn_cvt_pknorm_u16 : Intrinsic<
|
||||
[llvm_v2i16_ty], [llvm_float_ty, llvm_float_ty],
|
||||
[IntrNoMem, IntrSpeculatable]
|
||||
>;
|
||||
|
||||
def int_amdgcn_cvt_pk_i16 : Intrinsic<
|
||||
[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrSpeculatable]
|
||||
>;
|
||||
|
||||
def int_amdgcn_cvt_pk_u16 : Intrinsic<
|
||||
[llvm_v2i16_ty], [llvm_i32_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrSpeculatable]
|
||||
>;
|
||||
|
||||
def int_amdgcn_class : Intrinsic<
|
||||
[llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty],
|
||||
[IntrNoMem, IntrSpeculatable]
|
||||
|
@ -3957,6 +3957,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
||||
NODE_NAME_CASE(CVT_F32_UBYTE2)
|
||||
NODE_NAME_CASE(CVT_F32_UBYTE3)
|
||||
NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
|
||||
NODE_NAME_CASE(CVT_PKNORM_I16_F32)
|
||||
NODE_NAME_CASE(CVT_PKNORM_U16_F32)
|
||||
NODE_NAME_CASE(CVT_PK_I16_I32)
|
||||
NODE_NAME_CASE(CVT_PK_U16_U32)
|
||||
NODE_NAME_CASE(FP_TO_FP16)
|
||||
NODE_NAME_CASE(FP16_ZEXT)
|
||||
NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
|
||||
|
@ -417,6 +417,10 @@ enum NodeType : unsigned {
|
||||
// Convert two float 32 numbers into a single register holding two packed f16
|
||||
// with round to zero.
|
||||
CVT_PKRTZ_F16_F32,
|
||||
CVT_PKNORM_I16_F32,
|
||||
CVT_PKNORM_U16_F32,
|
||||
CVT_PK_I16_I32,
|
||||
CVT_PK_U16_U32,
|
||||
|
||||
// Same as the standard node, except the high bits of the resulting integer
|
||||
// are known 0.
|
||||
|
@ -35,6 +35,10 @@ def AMDGPUFPPackOp : SDTypeProfile<1, 2,
|
||||
[SDTCisFP<1>, SDTCisSameAs<1, 2>]
|
||||
>;
|
||||
|
||||
def AMDGPUIntPackOp : SDTypeProfile<1, 2,
|
||||
[SDTCisInt<1>, SDTCisSameAs<1, 2>]
|
||||
>;
|
||||
|
||||
def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
|
||||
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
|
||||
>;
|
||||
@ -142,6 +146,10 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
|
||||
def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
|
||||
|
||||
def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
|
||||
def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
|
||||
def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
|
||||
def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
|
||||
def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
|
||||
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
|
||||
def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
|
||||
|
||||
|
@ -205,6 +205,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
||||
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
|
||||
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
|
||||
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
|
||||
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
|
||||
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
|
||||
|
||||
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
|
||||
@ -3870,7 +3871,8 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
|
||||
}
|
||||
case ISD::INTRINSIC_WO_CHAIN: {
|
||||
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
|
||||
if (IID == Intrinsic::amdgcn_cvt_pkrtz) {
|
||||
switch (IID) {
|
||||
case Intrinsic::amdgcn_cvt_pkrtz: {
|
||||
SDValue Src0 = N->getOperand(1);
|
||||
SDValue Src1 = N->getOperand(2);
|
||||
SDLoc SL(N);
|
||||
@ -3879,6 +3881,29 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
|
||||
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
|
||||
return;
|
||||
}
|
||||
case Intrinsic::amdgcn_cvt_pknorm_i16:
|
||||
case Intrinsic::amdgcn_cvt_pknorm_u16:
|
||||
case Intrinsic::amdgcn_cvt_pk_i16:
|
||||
case Intrinsic::amdgcn_cvt_pk_u16: {
|
||||
SDValue Src0 = N->getOperand(1);
|
||||
SDValue Src1 = N->getOperand(2);
|
||||
SDLoc SL(N);
|
||||
unsigned Opcode;
|
||||
|
||||
if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
|
||||
Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
|
||||
else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
|
||||
Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
|
||||
else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
|
||||
Opcode = AMDGPUISD::CVT_PK_I16_I32;
|
||||
else
|
||||
Opcode = AMDGPUISD::CVT_PK_U16_U32;
|
||||
|
||||
SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
|
||||
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
|
||||
return;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
case ISD::INTRINSIC_W_CHAIN: {
|
||||
@ -4787,10 +4812,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
||||
case Intrinsic::amdgcn_ubfe:
|
||||
return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
|
||||
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
|
||||
case Intrinsic::amdgcn_cvt_pkrtz: {
|
||||
// FIXME: Stop adding cast if v2f16 legal.
|
||||
case Intrinsic::amdgcn_cvt_pkrtz:
|
||||
case Intrinsic::amdgcn_cvt_pknorm_i16:
|
||||
case Intrinsic::amdgcn_cvt_pknorm_u16:
|
||||
case Intrinsic::amdgcn_cvt_pk_i16:
|
||||
case Intrinsic::amdgcn_cvt_pk_u16: {
|
||||
// FIXME: Stop adding cast if v2f16/v2i16 are legal.
|
||||
EVT VT = Op.getValueType();
|
||||
SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
|
||||
unsigned Opcode;
|
||||
|
||||
if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
|
||||
Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
|
||||
else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
|
||||
Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
|
||||
else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
|
||||
Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
|
||||
else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
|
||||
Opcode = AMDGPUISD::CVT_PK_I16_I32;
|
||||
else
|
||||
Opcode = AMDGPUISD::CVT_PK_U16_U32;
|
||||
|
||||
SDValue Node = DAG.getNode(Opcode, DL, MVT::i32,
|
||||
Op.getOperand(1), Op.getOperand(2));
|
||||
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
|
||||
}
|
||||
|
@ -408,11 +408,11 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32
|
||||
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
|
||||
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
|
||||
defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
|
||||
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
|
||||
defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>>;
|
||||
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_i16_f32>;
|
||||
defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpknorm_u16_f32>;
|
||||
defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_I32_F32_F32>, AMDGPUpkrtz_f16_f32>;
|
||||
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>>;
|
||||
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>>;
|
||||
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_u16_u32>;
|
||||
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_I32_I32_I32>, AMDGPUpk_i16_i32>;
|
||||
|
||||
} // End SubtargetPredicate = isGCN
|
||||
|
||||
|
@ -3264,6 +3264,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_cvt_pknorm_i16:
|
||||
case Intrinsic::amdgcn_cvt_pknorm_u16:
|
||||
case Intrinsic::amdgcn_cvt_pk_i16:
|
||||
case Intrinsic::amdgcn_cvt_pk_u16: {
|
||||
Value *Src0 = II->getArgOperand(0);
|
||||
Value *Src1 = II->getArgOperand(1);
|
||||
|
||||
if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
|
||||
return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::amdgcn_ubfe:
|
||||
case Intrinsic::amdgcn_sbfe: {
|
||||
// Decompose simple cases into standard shifts.
|
||||
|
84
test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
Normal file
84
test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.i16.ll
Normal file
@ -0,0 +1,84 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||
|
||||
; GCN-LABEL: {{^}}s_cvt_pk_i16_i32:
|
||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
|
||||
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
|
||||
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
|
||||
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[X]], [[VY]]
|
||||
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[X]], [[VY]]
|
||||
define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
|
||||
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y)
|
||||
%r = bitcast <2 x i16> %result to i32
|
||||
store i32 %r, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_cvt_pk_i16_samereg_i32:
|
||||
; GCN: s_load_dword [[X:s[0-9]+]]
|
||||
; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
|
||||
define amdgpu_kernel void @s_cvt_pk_i16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
|
||||
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %x)
|
||||
%r = bitcast <2 x i16> %result to i32
|
||||
store i32 %r, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_cvt_pk_i16_i32:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[A]], [[B]]
|
||||
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[A]], [[B]]
|
||||
define amdgpu_kernel void @v_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile i32, i32 addrspace(1)* %a.gep
|
||||
%b = load volatile i32, i32 addrspace(1)* %b.gep
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 %b)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_reg_imm:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: v_cvt_pk_i16_i32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
|
||||
define amdgpu_kernel void @v_cvt_pk_i16_i32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile i32, i32 addrspace(1)* %a.gep
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %a, i32 1)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_cvt_pk_i16_i32_imm_reg:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, 1, [[A]]
|
||||
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, 1, [[A]]
|
||||
define amdgpu_kernel void @v_cvt_pk_i16_i32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile i32, i32 addrspace(1)* %a.gep
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 1, i32 %a)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) #1
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
84
test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
Normal file
84
test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.u16.ll
Normal file
@ -0,0 +1,84 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||
|
||||
; GCN-LABEL: {{^}}s_cvt_pk_u16_u32:
|
||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
|
||||
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
|
||||
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
|
||||
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[X]], [[VY]]
|
||||
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[X]], [[VY]]
|
||||
define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
|
||||
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y)
|
||||
%r = bitcast <2 x i16> %result to i32
|
||||
store i32 %r, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_cvt_pk_u16_samereg_i32:
|
||||
; GCN: s_load_dword [[X:s[0-9]+]]
|
||||
; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
|
||||
define amdgpu_kernel void @s_cvt_pk_u16_samereg_i32(i32 addrspace(1)* %out, i32 %x) #0 {
|
||||
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %x)
|
||||
%r = bitcast <2 x i16> %result to i32
|
||||
store i32 %r, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_cvt_pk_u16_u32:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[A]], [[B]]
|
||||
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[A]], [[B]]
|
||||
define amdgpu_kernel void @v_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile i32, i32 addrspace(1)* %a.gep
|
||||
%b = load volatile i32, i32 addrspace(1)* %b.gep
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 %b)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_reg_imm:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: v_cvt_pk_u16_u32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1
|
||||
define amdgpu_kernel void @v_cvt_pk_u16_u32_reg_imm(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile i32, i32 addrspace(1)* %a.gep
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %a, i32 1)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_cvt_pk_u16_u32_imm_reg:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, 1, [[A]]
|
||||
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, 1, [[A]]
|
||||
define amdgpu_kernel void @v_cvt_pk_u16_u32_imm_reg(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile i32, i32 addrspace(1)* %a.gep
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 1, i32 %a)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) #1
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
164
test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
Normal file
164
test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.i16.ll
Normal file
@ -0,0 +1,164 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||
|
||||
; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32:
|
||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
|
||||
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
|
||||
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
|
||||
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
|
||||
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[X]], [[VY]]
|
||||
define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
|
||||
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y)
|
||||
%r = bitcast <2 x i16> %result to i32
|
||||
store i32 %r, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_cvt_pknorm_i16_samereg_f32:
|
||||
; GCN: s_load_dword [[X:s[0-9]+]]
|
||||
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
|
||||
define amdgpu_kernel void @s_cvt_pknorm_i16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
|
||||
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %x)
|
||||
%r = bitcast <2 x i16> %result to i32
|
||||
store i32 %r, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
|
||||
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[A]], [[B]]
|
||||
define amdgpu_kernel void @v_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %b)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_reg_imm:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
|
||||
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float 1.0)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_imm_reg:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
|
||||
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, 1.0, [[A]]
|
||||
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float 1.0, float %a)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
|
||||
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%neg.a = fsub float -0.0, %a
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %b)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_hi:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
|
||||
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%neg.b = fsub float -0.0, %b
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %a, float %neg.b)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_lo_hi:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
|
||||
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%neg.a = fsub float -0.0, %a
|
||||
%neg.b = fsub float -0.0, %b
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.a, float %neg.b)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: v_cvt_pknorm_i16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
|
||||
define amdgpu_kernel void @v_cvt_pknorm_i16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%fabs.a = call float @llvm.fabs.f32(float %a)
|
||||
%neg.fabs.a = fsub float -0.0, %fabs.a
|
||||
%neg.b = fsub float -0.0, %b
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %neg.fabs.a, float %neg.b)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) #1
|
||||
declare float @llvm.fabs.f32(float) #1
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
164
test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
Normal file
164
test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pknorm.u16.ll
Normal file
@ -0,0 +1,164 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||
|
||||
; Test of llvm.amdgcn.cvt.pknorm.u16 where both float operands are kernel
; arguments (%x, %y). The checks expect two s_load_dword loads (in either
; order), a v_mov_b32 copying the second value (SY) into a vector register
; (VY), and the conversion reading the first value (X) directly from the
; scalar register. The SI prefix expects the _e32 mnemonic suffix; the VI
; prefix expects the mnemonic with no suffix. The <2 x i16> result is bitcast
; to i32 and stored to %out.
; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32:
|
||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
|
||||
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
|
||||
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
|
||||
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
|
||||
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[X]], [[VY]]
|
||||
define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
|
||||
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y)
|
||||
%r = bitcast <2 x i16> %result to i32
|
||||
store i32 %r, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Test of llvm.amdgcn.cvt.pknorm.u16 with the same kernel argument %x passed
; as both operands. The checks expect one s_load_dword and a conversion whose
; two source operands are that same scalar register; the pattern allows an
; optional _e64 suffix on the mnemonic.
; GCN-LABEL: {{^}}s_cvt_pknorm_u16_samereg_f32:
|
||||
; GCN: s_load_dword [[X:s[0-9]+]]
|
||||
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]]
|
||||
define amdgpu_kernel void @s_cvt_pknorm_u16_samereg_f32(i32 addrspace(1)* %out, float %x) #0 {
|
||||
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %x)
|
||||
%r = bitcast <2 x i16> %result to i32
|
||||
store i32 %r, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Per-workitem test of llvm.amdgcn.cvt.pknorm.u16 with both operands loaded
; from memory. Each pointer is indexed by the sign-extended workitem id, and
; the loads are volatile. The checks expect two dword loads (A, then B)
; feeding the conversion in that operand order; the SI prefix expects the _e32
; suffix and the VI prefix expects no suffix. The result is bitcast to i32 and
; stored to %out.gep.
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
|
||||
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[A]], [[B]]
|
||||
define amdgpu_kernel void @v_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %b)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; Per-workitem test of llvm.amdgcn.cvt.pknorm.u16 with a loaded value as the
; first operand and the constant 1.0 as the second. The checks expect one
; dword load and a conversion (optional _e64 suffix allowed) whose second
; source operand is printed as the literal 1.0.
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_reg_imm:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0
|
||||
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_reg_imm(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float 1.0)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; Per-workitem test of llvm.amdgcn.cvt.pknorm.u16 with the constant 1.0 as the
; first operand and a loaded value as the second. The checks expect one dword
; load and a conversion whose first source operand is printed as the literal
; 1.0; the SI prefix expects the _e32 suffix and the VI prefix expects no
; suffix.
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_imm_reg:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
|
||||
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, 1.0, [[A]]
|
||||
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_imm_reg(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float 1.0, float %a)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; Per-workitem test of llvm.amdgcn.cvt.pknorm.u16 where only the first operand
; is negated (fsub from -0.0). The checks expect two dword loads and a single
; conversion (optional _e64 suffix allowed) whose first source is printed as
; -A and whose second source is B unmodified.
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]]
|
||||
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%neg.a = fsub float -0.0, %a
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %b)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; Per-workitem test of llvm.amdgcn.cvt.pknorm.u16 where only the second
; operand is negated (fsub from -0.0). The checks expect two dword loads and a
; single conversion (optional _e64 suffix allowed) whose first source is A
; unmodified and whose second source is printed as -B.
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_hi:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]]
|
||||
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%neg.b = fsub float -0.0, %b
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %a, float %neg.b)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; Per-workitem test of llvm.amdgcn.cvt.pknorm.u16 where both operands are
; negated (each via fsub from -0.0). The checks expect two dword loads and a
; single conversion (optional _e64 suffix allowed) whose sources are printed
; as -A and -B.
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_lo_hi:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]]
|
||||
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_lo_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%neg.a = fsub float -0.0, %a
|
||||
%neg.b = fsub float -0.0, %b
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.a, float %neg.b)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; Per-workitem test of llvm.amdgcn.cvt.pknorm.u16 where the first operand is
; the negated absolute value of %a (llvm.fabs.f32 followed by fsub from -0.0)
; and the second operand is the negation of %b. The checks expect two dword
; loads and a single conversion (optional _e64 suffix allowed) whose sources
; are printed as -|A| and -B. The result is bitcast to i32 and stored to
; %out.gep.
; GCN-LABEL: {{^}}v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi:
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
|
||||
; GCN: {{buffer|flat|global}}_load_dword [[B:v[0-9]+]]
|
||||
; GCN: v_cvt_pknorm_u16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]]
|
||||
define amdgpu_kernel void @v_cvt_pknorm_u16_f32_fneg_fabs_lo_fneg_hi(i32 addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
|
||||
%b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %tid.ext
|
||||
%a = load volatile float, float addrspace(1)* %a.gep
|
||||
%b = load volatile float, float addrspace(1)* %b.gep
|
||||
%fabs.a = call float @llvm.fabs.f32(float %a)
|
||||
%neg.fabs.a = fsub float -0.0, %fabs.a
|
||||
%neg.b = fsub float -0.0, %b
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %neg.fabs.a, float %neg.b)
|
||||
%r = bitcast <2 x i16> %cvt to i32
|
||||
store i32 %r, i32 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) #1
|
||||
declare float @llvm.fabs.f32(float) #1
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
attributes #1 = { nounwind readnone }
|
@ -722,6 +722,114 @@ define <2 x half> @constant_rtz_pkrtz() {
|
||||
ret <2 x half> %cvt
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; llvm.amdgcn.cvt.pknorm.i16
|
||||
; --------------------------------------------------------------------
|
||||
|
||||
declare <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float, float) nounwind readnone
|
||||
|
||||
; llvm.amdgcn.cvt.pknorm.i16 with only the first operand undef: the check line
; expects the call to remain in the output exactly as written (no fold).
; CHECK-LABEL: @undef_lhs_cvt_pknorm_i16(
|
||||
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
|
||||
define <2 x i16> @undef_lhs_cvt_pknorm_i16(float %y) {
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float %y)
|
||||
ret <2 x i16> %cvt
|
||||
}
|
||||
|
||||
; llvm.amdgcn.cvt.pknorm.i16 with only the second operand undef: the check
; line expects the call to remain in the output exactly as written (no fold).
; CHECK-LABEL: @undef_rhs_cvt_pknorm_i16(
|
||||
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
|
||||
define <2 x i16> @undef_rhs_cvt_pknorm_i16(float %x) {
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float undef)
|
||||
ret <2 x i16> %cvt
|
||||
}
|
||||
|
||||
; llvm.amdgcn.cvt.pknorm.i16 with both operands undef: the check line expects
; the function to return an undef <2 x i16> directly, i.e. the call is folded
; away.
; CHECK-LABEL: @undef_cvt_pknorm_i16(
|
||||
; CHECK: ret <2 x i16> undef
|
||||
define <2 x i16> @undef_cvt_pknorm_i16() {
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float undef, float undef)
|
||||
ret <2 x i16> %cvt
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; llvm.amdgcn.cvt.pknorm.u16
|
||||
; --------------------------------------------------------------------
|
||||
|
||||
declare <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float, float) nounwind readnone
|
||||
|
||||
; llvm.amdgcn.cvt.pknorm.u16 with only the first operand undef: the check line
; expects the call to remain in the output exactly as written (no fold).
; CHECK-LABEL: @undef_lhs_cvt_pknorm_u16(
|
||||
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
|
||||
define <2 x i16> @undef_lhs_cvt_pknorm_u16(float %y) {
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float %y)
|
||||
ret <2 x i16> %cvt
|
||||
}
|
||||
|
||||
; llvm.amdgcn.cvt.pknorm.u16 with only the second operand undef: the check
; line expects the call to remain in the output exactly as written (no fold).
; CHECK-LABEL: @undef_rhs_cvt_pknorm_u16(
|
||||
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
|
||||
define <2 x i16> @undef_rhs_cvt_pknorm_u16(float %x) {
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float undef)
|
||||
ret <2 x i16> %cvt
|
||||
}
|
||||
|
||||
; llvm.amdgcn.cvt.pknorm.u16 with both operands undef: the check line expects
; the function to return an undef <2 x i16> directly, i.e. the call is folded
; away.
; CHECK-LABEL: @undef_cvt_pknorm_u16(
|
||||
; CHECK: ret <2 x i16> undef
|
||||
define <2 x i16> @undef_cvt_pknorm_u16() {
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float undef, float undef)
|
||||
ret <2 x i16> %cvt
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; llvm.amdgcn.cvt.pk.i16
|
||||
; --------------------------------------------------------------------
|
||||
|
||||
declare <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32, i32) nounwind readnone
|
||||
|
||||
; llvm.amdgcn.cvt.pk.i16 (i32 operands) with only the first operand undef: the
; check line expects the call to remain in the output exactly as written.
; CHECK-LABEL: @undef_lhs_cvt_pk_i16(
|
||||
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
|
||||
define <2 x i16> @undef_lhs_cvt_pk_i16(i32 %y) {
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 %y)
|
||||
ret <2 x i16> %cvt
|
||||
}
|
||||
|
||||
; llvm.amdgcn.cvt.pk.i16 (i32 operands) with only the second operand undef:
; the check line expects the call to remain in the output exactly as written.
; CHECK-LABEL: @undef_rhs_cvt_pk_i16(
|
||||
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
|
||||
define <2 x i16> @undef_rhs_cvt_pk_i16(i32 %x) {
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 undef)
|
||||
ret <2 x i16> %cvt
|
||||
}
|
||||
|
||||
; llvm.amdgcn.cvt.pk.i16 with both i32 operands undef: the check line expects
; the function to return an undef <2 x i16> directly, i.e. the call is folded
; away.
; CHECK-LABEL: @undef_cvt_pk_i16(
|
||||
; CHECK: ret <2 x i16> undef
|
||||
define <2 x i16> @undef_cvt_pk_i16() {
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 undef, i32 undef)
|
||||
ret <2 x i16> %cvt
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; llvm.amdgcn.cvt.pk.u16
|
||||
; --------------------------------------------------------------------
|
||||
|
||||
declare <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32, i32) nounwind readnone
|
||||
|
||||
; llvm.amdgcn.cvt.pk.u16 (i32 operands) with only the first operand undef: the
; check line expects the call to remain in the output exactly as written.
; CHECK-LABEL: @undef_lhs_cvt_pk_u16(
|
||||
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
|
||||
define <2 x i16> @undef_lhs_cvt_pk_u16(i32 %y) {
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 %y)
|
||||
ret <2 x i16> %cvt
|
||||
}
|
||||
|
||||
; llvm.amdgcn.cvt.pk.u16 (i32 operands) with only the second operand undef:
; the check line expects the call to remain in the output exactly as written.
; CHECK-LABEL: @undef_rhs_cvt_pk_u16(
|
||||
; CHECK: %cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
|
||||
define <2 x i16> @undef_rhs_cvt_pk_u16(i32 %x) {
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 undef)
|
||||
ret <2 x i16> %cvt
|
||||
}
|
||||
|
||||
; llvm.amdgcn.cvt.pk.u16 with both i32 operands undef: the check line expects
; the function to return an undef <2 x i16> directly, i.e. the call is folded
; away.
; CHECK-LABEL: @undef_cvt_pk_u16(
|
||||
; CHECK: ret <2 x i16> undef
|
||||
define <2 x i16> @undef_cvt_pk_u16() {
|
||||
%cvt = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 undef, i32 undef)
|
||||
ret <2 x i16> %cvt
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; llvm.amdgcn.ubfe
|
||||
; --------------------------------------------------------------------
|
||||
|
Loading…
x
Reference in New Issue
Block a user