mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 03:02:36 +01:00
AMDGPU: Use v_med3_{f16|i16|u16}
llvm-svn: 296401
This commit is contained in:
parent
f2e8848268
commit
824e186e4d
@ -664,9 +664,10 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat <
|
||||
class IntMed3Pat<Instruction med3Inst,
|
||||
SDPatternOperator max,
|
||||
SDPatternOperator max_oneuse,
|
||||
SDPatternOperator min_oneuse> : Pat<
|
||||
(max (min_oneuse i32:$src0, i32:$src1),
|
||||
(min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
|
||||
SDPatternOperator min_oneuse,
|
||||
ValueType vt = i32> : Pat<
|
||||
(max (min_oneuse vt:$src0, vt:$src1),
|
||||
(min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
|
||||
(med3Inst $src0, $src1, $src2)
|
||||
>;
|
||||
|
||||
|
@ -276,6 +276,10 @@ public:
|
||||
return (getGeneration() >= EVERGREEN);
|
||||
}
|
||||
|
||||
// Reports whether the 16-bit med3 operations can be used: true when the
// subtarget generation is GFX9 or newer.
bool hasMed3_16() const {
|
||||
return getGeneration() >= GFX9;
|
||||
}
|
||||
|
||||
bool hasCARRY() const {
|
||||
return (getGeneration() >= EVERGREEN);
|
||||
}
|
||||
|
@ -4069,8 +4069,9 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
|
||||
}
|
||||
}
|
||||
|
||||
static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
|
||||
SDValue Op0, SDValue Op1, bool Signed) {
|
||||
SDValue SITargetLowering::performIntMed3ImmCombine(
|
||||
SelectionDAG &DAG, const SDLoc &SL,
|
||||
SDValue Op0, SDValue Op1, bool Signed) const {
|
||||
ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
|
||||
if (!K1)
|
||||
return SDValue();
|
||||
@ -4088,23 +4089,22 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
|
||||
}
|
||||
|
||||
EVT VT = K0->getValueType(0);
|
||||
unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
|
||||
if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
|
||||
return DAG.getNode(Med3Opc, SL, VT,
|
||||
Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
|
||||
}
|
||||
|
||||
// If there isn't a 16-bit med3 operation, convert to 32-bit.
|
||||
MVT NVT = MVT::i32;
|
||||
unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
|
||||
|
||||
SDValue Tmp1, Tmp2, Tmp3;
|
||||
Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
|
||||
Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
|
||||
Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
|
||||
SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
|
||||
SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
|
||||
SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
|
||||
|
||||
if (VT == MVT::i16) {
|
||||
Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
|
||||
Tmp1, Tmp2, Tmp3);
|
||||
|
||||
return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
|
||||
} else
|
||||
return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
|
||||
Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
|
||||
SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
|
||||
return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
|
||||
}
|
||||
|
||||
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
|
||||
@ -4141,9 +4141,8 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
|
||||
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
|
||||
}
|
||||
|
||||
// No med3 for f16, but clamp is possible.
|
||||
// TODO: gfx9 has med3 f16
|
||||
if (VT == MVT::f16 || VT == MVT::f64)
|
||||
// med3 for f16 is only available on gfx9+.
|
||||
if (VT == MVT::f64 || (VT == MVT::f16 && !Subtarget->hasMed3_16()))
|
||||
return SDValue();
|
||||
|
||||
// This isn't safe with signaling NaNs because in IEEE mode, min/max on a
|
||||
|
@ -86,6 +86,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
|
||||
|
||||
SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
|
||||
SDValue Op0, SDValue Op1) const;
|
||||
SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
|
||||
SDValue Op0, SDValue Op1, bool Signed) const;
|
||||
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
|
||||
|
@ -1321,7 +1321,7 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
|
||||
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
|
||||
def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
|
||||
|
||||
def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
|
||||
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
|
||||
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
|
||||
|
||||
def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
|
||||
|
@ -1216,6 +1216,14 @@ def : Pat <
|
||||
// Miscellaneous Optimization Patterns
|
||||
//============================================================================//
|
||||
|
||||
// Undo sub x, c -> add x, -c canonicalization since c is more likely
|
||||
// an inline immediate than -c.
|
||||
// TODO: Also do for 64-bit.
|
||||
def : Pat<
|
||||
(add i32:$src0, (i32 NegSubInlineConst32:$src1)),
|
||||
(S_SUB_I32 $src0, NegSubInlineConst32:$src1)
|
||||
>;
|
||||
|
||||
def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
|
||||
|
||||
def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
|
||||
@ -1235,14 +1243,11 @@ class FPMed3Pat<ValueType vt,
|
||||
|
||||
def : FPMed3Pat<f32, V_MED3_F32>;
|
||||
|
||||
|
||||
// Undo sub x, c -> add x, -c canonicalization since c is more likely
|
||||
// an inline immediate than -c.
|
||||
// TODO: Also do for 64-bit.
|
||||
def : Pat<
|
||||
(add i32:$src0, (i32 NegSubInlineConst32:$src1)),
|
||||
(S_SUB_I32 $src0, NegSubInlineConst32:$src1)
|
||||
>;
|
||||
// 16-bit med3 selection patterns (f16, signed i16, unsigned i16); these are
// only enabled under the isGFX9 predicate.
let Predicates = [isGFX9] in {
|
||||
def : FPMed3Pat<f16, V_MED3_F16>;
|
||||
def : IntMed3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>;
|
||||
def : IntMed3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>;
|
||||
} // End Predicates = [isGFX9]
|
||||
|
||||
//============================================================================//
|
||||
// Assembler aliases
|
||||
|
@ -258,8 +258,8 @@ def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
|
||||
|
||||
let Predicates = [isVI] in {
|
||||
|
||||
multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
|
||||
Instruction inst, SDPatternOperator op3> {
|
||||
multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
|
||||
Instruction inst, SDPatternOperator op3> {
|
||||
def : Pat<
|
||||
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
|
||||
(inst i16:$src0, i16:$src1, i16:$src2)
|
||||
@ -278,8 +278,8 @@ def : Pat<
|
||||
>;
|
||||
}
|
||||
|
||||
defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>;
|
||||
defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>;
|
||||
defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
|
||||
defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
|
||||
|
||||
} // End Predicates = [isVI]
|
||||
|
||||
@ -291,6 +291,10 @@ def V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
|
||||
def V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
|
||||
def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
|
||||
def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
|
||||
|
||||
def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>;
|
||||
def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>;
|
||||
def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>;
|
||||
}
|
||||
|
||||
|
||||
@ -487,3 +491,7 @@ defm V_LSHL_OR_B32 : VOP3_Real_vi <0x200>;
|
||||
defm V_AND_OR_B32 : VOP3_Real_vi <0x201>;
|
||||
defm V_OR3_B32 : VOP3_Real_vi <0x202>;
|
||||
defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>;
|
||||
|
||||
defm V_MED3_F16 : VOP3_Real_vi <0x1fa>;
|
||||
defm V_MED3_I16 : VOP3_Real_vi <0x1fb>;
|
||||
defm V_MED3_U16 : VOP3_Real_vi <0x1fc>;
|
||||
|
@ -1,5 +1,10 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
|
||||
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32:
|
||||
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}}
|
||||
@ -688,8 +693,8 @@ define void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float
|
||||
; ---------------------------------------------------------------------
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use0:
|
||||
; GCN: v_min_f32
|
||||
; GCN: v_max_f32
|
||||
; GCN-DAG: v_min_f32
|
||||
; GCN-DAG: v_max_f32
|
||||
; GCN: v_min_f32
|
||||
; GCN: v_max_f32
|
||||
define void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
|
||||
@ -884,12 +889,86 @@ define void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float add
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f16:
|
||||
; SI: v_cvt_f32_f16
|
||||
; SI: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
|
||||
; SI: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
|
||||
; SI: v_cvt_f16_f32
|
||||
|
||||
; VI: v_add_f16_e32 v{{[0-9]+}}, 1.0
|
||||
; VI: v_max_f16_e32 v{{[0-9]+}}, 2.0
|
||||
; VI: v_min_f16_e32 v{{[0-9]+}}, 4.0
|
||||
|
||||
; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0
|
||||
; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
|
||||
define void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
|
||||
%outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
|
||||
%a = load half, half addrspace(1)* %gep0
|
||||
%a.add = fadd nnan half %a, 1.0
|
||||
%max = call half @llvm.maxnum.f16(half %a.add, half 2.0)
|
||||
%med = call half @llvm.minnum.f16(half %max, half 4.0)
|
||||
|
||||
store half %med, half addrspace(1)* %outgep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_nnan_inputs_med3_f16_pat0:
|
||||
; GCN: {{buffer_|flat_}}load_ushort [[A:v[0-9]+]]
|
||||
; GCN: {{buffer_|flat_}}load_ushort [[B:v[0-9]+]]
|
||||
; GCN: {{buffer_|flat_}}load_ushort [[C:v[0-9]+]]
|
||||
|
||||
; SI: v_cvt_f32_f16
|
||||
; SI: v_cvt_f32_f16
|
||||
; SI: v_add_f32_e32
|
||||
; SI: v_add_f32_e32
|
||||
; SI: v_add_f32_e32
|
||||
; SI: v_med3_f32
|
||||
; SI: v_cvt_f16_f32_e32
|
||||
|
||||
|
||||
; GFX89-DAG: v_add_f16_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]]
|
||||
; GFX89-DAG: v_add_f16_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
|
||||
; GFX89-DAG: v_add_f16_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
|
||||
|
||||
; VI-DAG: v_min_f16
|
||||
; VI-DAG: v_max_f16
|
||||
; VI: v_min_f16
|
||||
; VI: v_max_f16
|
||||
|
||||
; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
|
||||
define void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
|
||||
%gep1 = getelementptr half, half addrspace(1)* %bptr, i32 %tid
|
||||
%gep2 = getelementptr half, half addrspace(1)* %cptr, i32 %tid
|
||||
%outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
|
||||
%a = load volatile half, half addrspace(1)* %gep0
|
||||
%b = load volatile half, half addrspace(1)* %gep1
|
||||
%c = load volatile half, half addrspace(1)* %gep2
|
||||
|
||||
%a.nnan = fadd nnan half %a, 1.0
|
||||
%b.nnan = fadd nnan half %b, 2.0
|
||||
%c.nnan = fadd nnan half %c, 4.0
|
||||
|
||||
%tmp0 = call half @llvm.minnum.f16(half %a.nnan, half %b.nnan)
|
||||
%tmp1 = call half @llvm.maxnum.f16(half %a.nnan, half %b.nnan)
|
||||
%tmp2 = call half @llvm.minnum.f16(half %tmp1, half %c.nnan)
|
||||
%med3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2)
|
||||
store half %med3, half addrspace(1)* %outgep
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #0
|
||||
declare float @llvm.fabs.f32(float) #0
|
||||
declare float @llvm.minnum.f32(float, float) #0
|
||||
declare float @llvm.maxnum.f32(float, float) #0
|
||||
declare double @llvm.minnum.f64(double, double) #0
|
||||
declare double @llvm.maxnum.f64(double, double) #0
|
||||
declare half @llvm.fabs.f16(half) #0
|
||||
declare half @llvm.minnum.f16(half, half) #0
|
||||
declare half @llvm.maxnum.f16(half, half) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
|
||||
|
39
test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
Normal file
39
test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
Normal file
@ -0,0 +1,39 @@
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
|
||||
; GCN-LABEL: {{^}}test_fmed3_f16:
|
||||
; GCN: v_med3_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
define void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
|
||||
%src0.f16 = trunc i32 %src0.arg to i16
|
||||
%src0 = bitcast i16 %src0.f16 to half
|
||||
%src1.f16 = trunc i32 %src1.arg to i16
|
||||
%src1 = bitcast i16 %src1.f16 to half
|
||||
%src2.f16 = trunc i32 %src2.arg to i16
|
||||
%src2 = bitcast i16 %src2.f16 to half
|
||||
%mad = call half @llvm.amdgcn.fmed3.f16(half %src0, half %src1, half %src2)
|
||||
store half %mad, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}test_fmed3_srcmods_f16:
|
||||
; GCN: v_med3_f16 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}|
|
||||
define void @test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
|
||||
%src0.f16 = trunc i32 %src0.arg to i16
|
||||
%src0 = bitcast i16 %src0.f16 to half
|
||||
%src1.f16 = trunc i32 %src1.arg to i16
|
||||
%src1 = bitcast i16 %src1.f16 to half
|
||||
%src2.f16 = trunc i32 %src2.arg to i16
|
||||
%src2 = bitcast i16 %src2.f16 to half
|
||||
%src0.fneg = fsub half -0.0, %src0
|
||||
%src1.fabs = call half @llvm.fabs.f16(half %src1)
|
||||
%src2.fabs = call half @llvm.fabs.f16(half %src2)
|
||||
%src2.fneg.fabs = fsub half -0.0, %src2.fabs
|
||||
%mad = call half @llvm.amdgcn.fmed3.f16(half %src0.fneg, half %src1.fabs, half %src2.fneg.fabs)
|
||||
store half %mad, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
declare half @llvm.amdgcn.fmed3.f16(half, half, half) #0
|
||||
declare half @llvm.fabs.f16(half) #0
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #1 = { nounwind }
|
@ -1,12 +1,13 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
|
||||
|
||||
declare i32 @llvm.r600.read.tidig.x() #0
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #0
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32:
|
||||
; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
|
||||
define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
|
||||
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
|
||||
%a = load i32, i32 addrspace(1)* %gep0
|
||||
@ -25,7 +26,7 @@ define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a
|
||||
; GCN: v_max_i32
|
||||
; GCN: v_min_i32
|
||||
define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
|
||||
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
|
||||
%a = load i32, i32 addrspace(1)* %gep0
|
||||
@ -45,7 +46,7 @@ define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrsp
|
||||
; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
|
||||
; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
|
||||
define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
|
||||
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
|
||||
%a = load i32, i32 addrspace(1)* %gep0
|
||||
@ -64,7 +65,7 @@ define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 a
|
||||
; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
|
||||
; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
|
||||
define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
|
||||
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
|
||||
%a = load i32, i32 addrspace(1)* %gep0
|
||||
@ -83,7 +84,7 @@ define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 ad
|
||||
; GCN: v_cmp_lt_i64
|
||||
; GCN: v_cmp_gt_i64
|
||||
define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
|
||||
%outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
|
||||
%a = load i64, i64 addrspace(1)* %gep0
|
||||
@ -99,9 +100,10 @@ define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16:
|
||||
; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
|
||||
; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
|
||||
; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
|
||||
define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
|
||||
%outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
|
||||
%a = load i16, i16 addrspace(1)* %gep0
|
||||
@ -362,6 +364,7 @@ bb:
|
||||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Should keep scalar or not promote
|
||||
; GCN-LABEL: {{^}}s_test_smed3_i16_pat_0:
|
||||
; GCN: s_sext_i32_i16
|
||||
; GCN: s_sext_i32_i16
|
||||
@ -444,6 +447,35 @@ bb:
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_smed3_i16_pat_0:
|
||||
; SI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
; FIXME: VI not matching med3
|
||||
; VI: v_min_i16
|
||||
; VI: v_max_i16
|
||||
; VI: v_min_i16
|
||||
; VI: v_max_i16
|
||||
|
||||
; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
define void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
|
||||
bb:
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
|
||||
%gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
|
||||
%gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
|
||||
%out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
|
||||
%x = load i16, i16 addrspace(1)* %gep0
|
||||
%y = load i16, i16 addrspace(1)* %gep1
|
||||
%z = load i16, i16 addrspace(1)* %gep2
|
||||
|
||||
%tmp0 = call i16 @smin16(i16 %x, i16 %y)
|
||||
%tmp1 = call i16 @smax16(i16 %x, i16 %y)
|
||||
%tmp2 = call i16 @smin16(i16 %tmp1, i16 %z)
|
||||
%tmp3 = call i16 @smax16(i16 %tmp0, i16 %tmp2)
|
||||
store i16 %tmp3, i16 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #1 = { nounwind }
|
||||
attributes #2 = { nounwind readnone alwaysinline }
|
||||
|
@ -1,12 +1,13 @@
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
|
||||
|
||||
declare i32 @llvm.r600.read.tidig.x() #0
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #0
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32:
|
||||
; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
|
||||
define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
|
||||
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
|
||||
%a = load i32, i32 addrspace(1)* %gep0
|
||||
@ -25,7 +26,7 @@ define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a
|
||||
; GCN: v_max_u32
|
||||
; GCN: v_min_u32
|
||||
define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
|
||||
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
|
||||
%a = load i32, i32 addrspace(1)* %gep0
|
||||
@ -45,7 +46,7 @@ define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrsp
|
||||
; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
|
||||
; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
|
||||
define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
|
||||
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
|
||||
%a = load i32, i32 addrspace(1)* %gep0
|
||||
@ -64,7 +65,7 @@ define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 a
|
||||
; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
|
||||
; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
|
||||
define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
|
||||
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
|
||||
%a = load i32, i32 addrspace(1)* %gep0
|
||||
@ -83,7 +84,7 @@ define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 ad
|
||||
; GCN: v_cmp_lt_u64
|
||||
; GCN: v_cmp_gt_u64
|
||||
define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
|
||||
%outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
|
||||
%a = load i64, i64 addrspace(1)* %gep0
|
||||
@ -99,9 +100,10 @@ define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16:
|
||||
; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
|
||||
; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
|
||||
; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
|
||||
define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
|
||||
%tid = call i32 @llvm.r600.read.tidig.x()
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
|
||||
%outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
|
||||
%a = load i16, i16 addrspace(1)* %gep0
|
||||
@ -479,6 +481,35 @@ bb:
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_test_umed3_i16_pat_0:
|
||||
; SI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
|
||||
; FIXME: VI not matching med3
|
||||
; VI: v_min_u16
|
||||
; VI: v_max_u16
|
||||
; VI: v_min_u16
|
||||
; VI: v_max_u16
|
||||
|
||||
; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
define void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
|
||||
bb:
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x()
|
||||
%gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
|
||||
%gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
|
||||
%gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
|
||||
%out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
|
||||
%x = load i16, i16 addrspace(1)* %gep0
|
||||
%y = load i16, i16 addrspace(1)* %gep1
|
||||
%z = load i16, i16 addrspace(1)* %gep2
|
||||
|
||||
%tmp0 = call i16 @umin16(i16 %x, i16 %y)
|
||||
%tmp1 = call i16 @umax16(i16 %x, i16 %y)
|
||||
%tmp2 = call i16 @umin16(i16 %tmp1, i16 %z)
|
||||
%tmp3 = call i16 @umax16(i16 %tmp0, i16 %tmp2)
|
||||
store i16 %tmp3, i16 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind readnone }
|
||||
attributes #1 = { nounwind }
|
||||
attributes #2 = { nounwind readnone alwaysinline }
|
||||
|
@ -30,3 +30,15 @@ v_or3_b32 v1, v2, v3, v4
|
||||
v_pack_b32_f16 v1, v2, v3
|
||||
// GFX9: v_pack_b32_f16 v1, v2, v3 ; encoding: [0x01,0x00,0xa0,0xd2,0x02,0x07,0x02,0x00]
|
||||
// NOVI: :1: error: instruction not supported on this GPU
|
||||
|
||||
v_med3_f16 v1, v2, v3, v4
|
||||
// GFX9: v_med3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfa,0xd1,0x02,0x07,0x12,0x04]
|
||||
// NOVI: :1: error: instruction not supported on this GPU
|
||||
|
||||
v_med3_i16 v1, v2, v3, v4
|
||||
// GFX9: v_med3_i16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfb,0xd1,0x02,0x07,0x12,0x04]
|
||||
// NOVI: :1: error: instruction not supported on this GPU
|
||||
|
||||
v_med3_u16 v1, v2, v3, v4
|
||||
// GFX9: v_med3_u16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfc,0xd1,0x02,0x07,0x12,0x04]
|
||||
// NOVI: :1: error: instruction not supported on this GPU
|
||||
|
Loading…
Reference in New Issue
Block a user