1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 03:02:36 +01:00

AMDGPU: Use v_med3_{f16|i16|u16}

llvm-svn: 296401
This commit is contained in:
Matt Arsenault 2017-02-27 22:40:39 +00:00
parent f2e8848268
commit 824e186e4d
12 changed files with 269 additions and 57 deletions

View File

@ -664,9 +664,10 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat <
class IntMed3Pat<Instruction med3Inst,
SDPatternOperator max,
SDPatternOperator max_oneuse,
SDPatternOperator min_oneuse> : Pat<
(max (min_oneuse i32:$src0, i32:$src1),
(min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
SDPatternOperator min_oneuse,
ValueType vt = i32> : Pat<
(max (min_oneuse vt:$src0, vt:$src1),
(min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
(med3Inst $src0, $src1, $src2)
>;

View File

@ -276,6 +276,10 @@ public:
return (getGeneration() >= EVERGREEN);
}
/// Returns true when the subtarget generation is GFX9 or newer, which is the
/// condition under which the 16-bit med3 instructions (v_med3_f16,
/// v_med3_i16, v_med3_u16) are used.
bool hasMed3_16() const {
  const bool IsGFX9OrNewer = getGeneration() >= GFX9;
  return IsGFX9OrNewer;
}
/// Returns true when the subtarget generation is EVERGREEN or newer.
bool hasCARRY() const {
  const bool IsEvergreenOrNewer = getGeneration() >= EVERGREEN;
  return IsEvergreenOrNewer;
}

View File

@ -4069,8 +4069,9 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
}
}
static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
SDValue Op0, SDValue Op1, bool Signed) {
SDValue SITargetLowering::performIntMed3ImmCombine(
SelectionDAG &DAG, const SDLoc &SL,
SDValue Op0, SDValue Op1, bool Signed) const {
ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
if (!K1)
return SDValue();
@ -4088,23 +4089,22 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
}
EVT VT = K0->getValueType(0);
unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
return DAG.getNode(Med3Opc, SL, VT,
Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
}
// If there isn't a 16-bit med3 operation, convert to 32-bit.
MVT NVT = MVT::i32;
unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
SDValue Tmp1, Tmp2, Tmp3;
Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
if (VT == MVT::i16) {
Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
Tmp1, Tmp2, Tmp3);
return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
} else
return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
}
static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
@ -4141,9 +4141,8 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
}
// No med3 for f16, but clamp is possible.
// TODO: gfx9 has med3 f16
if (VT == MVT::f16 || VT == MVT::f64)
// med3 for f16 is only available on gfx9+.
if (VT == MVT::f64 || (VT == MVT::f16 && !Subtarget->hasMed3_16()))
return SDValue();
// This isn't safe with signaling NaNs because in IEEE mode, min/max on a

View File

@ -86,6 +86,8 @@ class SITargetLowering final : public AMDGPUTargetLowering {
SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
SDValue Op0, SDValue Op1) const;
SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
SDValue Op0, SDValue Op1, bool Signed) const;
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;

View File

@ -1321,7 +1321,7 @@ def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;

View File

@ -1216,6 +1216,14 @@ def : Pat <
// Miscellaneous Optimization Patterns
//============================================================================//
// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
// TODO: Also do for 64-bit.
def : Pat<
(add i32:$src0, (i32 NegSubInlineConst32:$src1)),
(S_SUB_I32 $src0, NegSubInlineConst32:$src1)
>;
def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
@ -1235,14 +1243,11 @@ class FPMed3Pat<ValueType vt,
def : FPMed3Pat<f32, V_MED3_F32>;
// Undo sub x, c -> add x, -c canonicalization since c is more likely
// an inline immediate than -c.
// TODO: Also do for 64-bit.
def : Pat<
(add i32:$src0, (i32 NegSubInlineConst32:$src1)),
(S_SUB_I32 $src0, NegSubInlineConst32:$src1)
>;
// 16-bit med3 selection patterns. They are only active under the isGFX9
// predicate: one floating-point pattern for f16, plus the signed and unsigned
// integer min/max patterns instantiated with i16 as the value type.
let Predicates = [isGFX9] in {
def : FPMed3Pat<f16, V_MED3_F16>;
def : IntMed3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>;
def : IntMed3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>;
} // End Predicates = [isGFX9]
//============================================================================//
// Assembler aliases

View File

@ -258,8 +258,8 @@ def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
let Predicates = [isVI] in {
multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
Instruction inst, SDPatternOperator op3> {
multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
Instruction inst, SDPatternOperator op3> {
def : Pat<
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
(inst i16:$src0, i16:$src1, i16:$src2)
@ -278,8 +278,8 @@ def : Pat<
>;
}
defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>;
defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>;
defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
} // End Predicates = [isVI]
@ -291,6 +291,10 @@ def V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>;
def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>;
def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>;
}
@ -487,3 +491,7 @@ defm V_LSHL_OR_B32 : VOP3_Real_vi <0x200>;
defm V_AND_OR_B32 : VOP3_Real_vi <0x201>;
defm V_OR3_B32 : VOP3_Real_vi <0x202>;
defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>;
defm V_MED3_F16 : VOP3_Real_vi <0x1fa>;
defm V_MED3_I16 : VOP3_Real_vi <0x1fb>;
defm V_MED3_U16 : VOP3_Real_vi <0x1fc>;

View File

@ -1,5 +1,10 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32:
; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}}
@ -688,8 +693,8 @@ define void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float
; ---------------------------------------------------------------------
; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use0:
; GCN: v_min_f32
; GCN: v_max_f32
; GCN-DAG: v_min_f32
; GCN-DAG: v_max_f32
; GCN: v_min_f32
; GCN: v_max_f32
define void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
@ -884,12 +889,86 @@ define void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float add
ret void
}
; f16 clamp between two constants: minnum(maxnum(a + 1.0, 2.0), 4.0), where the
; fadd carries the nnan flag. Expected selection per run line:
;  - default CPU: computed in f32 (v_med3_f32) with f16 conversions around it
;  - tonga: separate v_max_f16 / v_min_f16
;  - gfx901: a single v_med3_f16 whose first source is the add result
; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f16:
; SI: v_cvt_f32_f16
; SI: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
; SI: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
; SI: v_cvt_f16_f32

; VI: v_add_f16_e32 v{{[0-9]+}}, 1.0
; VI: v_max_f16_e32 v{{[0-9]+}}, 2.0
; VI: v_min_f16_e32 v{{[0-9]+}}, 4.0

; The add result is captured here so the med3 operand check below is tied to
; this function's add and cannot match against a stale binding left over from
; an earlier function (FileCheck variables persist across labels by default).
; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0
; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
define void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 {
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
  %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
  %a = load half, half addrspace(1)* %gep0
  %a.add = fadd nnan half %a, 1.0

  ; Lower bound first, then upper bound.
  %max = call half @llvm.maxnum.f16(half %a.add, half 2.0)
  %med = call half @llvm.minnum.f16(half %max, half 4.0)

  store half %med, half addrspace(1)* %outgep
  ret void
}
; Full three-operand med3 pattern on f16:
;   max(min(a, b), min(max(a, b), c))
; where a, b and c are each the result of an `fadd nnan`. The checks expect
; f32 promotion with v_med3_f32 on the default CPU, separate f16 min/max
; instructions on tonga, and one v_med3_f16 taking the three add results in
; order on gfx901.
; GCN-LABEL: {{^}}v_nnan_inputs_med3_f16_pat0:
; GCN: {{buffer_|flat_}}load_ushort [[A:v[0-9]+]]
; GCN: {{buffer_|flat_}}load_ushort [[B:v[0-9]+]]
; GCN: {{buffer_|flat_}}load_ushort [[C:v[0-9]+]]
; SI: v_cvt_f32_f16
; SI: v_cvt_f32_f16
; SI: v_add_f32_e32
; SI: v_add_f32_e32
; SI: v_add_f32_e32
; SI: v_med3_f32
; SI: v_cvt_f16_f32_e32
; GFX89-DAG: v_add_f16_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]]
; GFX89-DAG: v_add_f16_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
; GFX89-DAG: v_add_f16_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
; VI-DAG: v_min_f16
; VI-DAG: v_max_f16
; VI: v_min_f16
; VI: v_max_f16
; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
define void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
%gep1 = getelementptr half, half addrspace(1)* %bptr, i32 %tid
%gep2 = getelementptr half, half addrspace(1)* %cptr, i32 %tid
%outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
%a = load volatile half, half addrspace(1)* %gep0
%b = load volatile half, half addrspace(1)* %gep1
%c = load volatile half, half addrspace(1)* %gep2
; Each input goes through an nnan fadd with a distinct constant (1.0, 2.0,
; 4.0), which is what the A_ADD / B_ADD / C_ADD checks above key on.
%a.nnan = fadd nnan half %a, 1.0
%b.nnan = fadd nnan half %b, 2.0
%c.nnan = fadd nnan half %c, 4.0
; tmp0 = min(a, b); tmp1 = max(a, b); tmp2 = min(tmp1, c); med3 = max(tmp0, tmp2)
%tmp0 = call half @llvm.minnum.f16(half %a.nnan, half %b.nnan)
%tmp1 = call half @llvm.maxnum.f16(half %a.nnan, half %b.nnan)
%tmp2 = call half @llvm.minnum.f16(half %tmp1, half %c.nnan)
%med3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2)
store half %med3, half addrspace(1)* %outgep
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare float @llvm.fabs.f32(float) #0
declare float @llvm.minnum.f32(float, float) #0
declare float @llvm.maxnum.f32(float, float) #0
declare double @llvm.minnum.f64(double, double) #0
declare double @llvm.maxnum.f64(double, double) #0
declare half @llvm.fabs.f16(half) #0
declare half @llvm.minnum.f16(half, half) #0
declare half @llvm.maxnum.f16(half, half) #0
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }

View File

@ -0,0 +1,39 @@
; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; Calls the fmed3 intrinsic on three half values. Each value is produced by
; truncating an i32 function argument to 16 bits and reinterpreting those bits
; as half. The check expects a single v_med3_f16.
; GCN-LABEL: {{^}}test_fmed3_f16:
; GCN: v_med3_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
  %bits0 = trunc i32 %src0.arg to i16
  %half0 = bitcast i16 %bits0 to half
  %bits1 = trunc i32 %src1.arg to i16
  %half1 = bitcast i16 %bits1 to half
  %bits2 = trunc i32 %src2.arg to i16
  %half2 = bitcast i16 %bits2 to half
  %med3 = call half @llvm.amdgcn.fmed3.f16(half %half0, half %half1, half %half2)
  store half %med3, half addrspace(1)* %out
  ret void
}
; Same as the plain fmed3 f16 test, but the operands are first negated (src0),
; made absolute (src1), and made absolute then negated (src2). The check
; expects these to appear as source modifiers on one v_med3_f16: -s, |v|, -|v|.
; GCN-LABEL: {{^}}test_fmed3_srcmods_f16:
; GCN: v_med3_f16 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}|
define void @test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
  %bits0 = trunc i32 %src0.arg to i16
  %half0 = bitcast i16 %bits0 to half
  %bits1 = trunc i32 %src1.arg to i16
  %half1 = bitcast i16 %bits1 to half
  %bits2 = trunc i32 %src2.arg to i16
  %half2 = bitcast i16 %bits2 to half
  ; Negation is written as a subtraction from -0.0.
  %neg0 = fsub half -0.0, %half0
  %abs1 = call half @llvm.fabs.f16(half %half1)
  %abs2 = call half @llvm.fabs.f16(half %half2)
  %neg.abs2 = fsub half -0.0, %abs2
  %med3 = call half @llvm.amdgcn.fmed3.f16(half %neg0, half %abs1, half %neg.abs2)
  store half %med3, half addrspace(1)* %out
  ret void
}
declare half @llvm.amdgcn.fmed3.f16(half, half, half) #0
declare half @llvm.fabs.f16(half) #0
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

View File

@ -1,12 +1,13 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
declare i32 @llvm.r600.read.tidig.x() #0
declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32:
; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.r600.read.tidig.x()
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@ -25,7 +26,7 @@ define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a
; GCN: v_max_i32
; GCN: v_min_i32
define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.r600.read.tidig.x()
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@ -45,7 +46,7 @@ define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrsp
; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.r600.read.tidig.x()
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@ -64,7 +65,7 @@ define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 a
; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.r600.read.tidig.x()
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@ -83,7 +84,7 @@ define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 ad
; GCN: v_cmp_lt_i64
; GCN: v_cmp_gt_i64
define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.r600.read.tidig.x()
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
%a = load i64, i64 addrspace(1)* %gep0
@ -99,9 +100,10 @@ define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
}
; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16:
; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.r600.read.tidig.x()
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
%a = load i16, i16 addrspace(1)* %gep0
@ -362,6 +364,7 @@ bb:
ret void
}
; FIXME: Should keep scalar or not promote
; GCN-LABEL: {{^}}s_test_smed3_i16_pat_0:
; GCN: s_sext_i32_i16
; GCN: s_sext_i32_i16
@ -444,6 +447,35 @@ bb:
ret void
}
; Signed i16 med3 pattern with three loaded operands:
;   smax(smin(x, y), smin(smax(x, y), z))
; The checks expect v_med3_i32 on the default CPU, four separate i16 min/max
; instructions on tonga (see the FIXME below), and v_med3_i16 on gfx901.
; NOTE(review): @smin16 / @smax16 are not defined in this view; presumably
; they are inlined min/max helpers from elsewhere in this test file - confirm.
; GCN-LABEL: {{^}}v_test_smed3_i16_pat_0:
; SI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; FIXME: VI not matching med3
; VI: v_min_i16
; VI: v_max_i16
; VI: v_min_i16
; VI: v_max_i16
; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; x, y and z are read from %a.ptr at element offsets tid, tid + 3 and tid + 8.
; The %arg parameter is not referenced in the body.
%gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
%gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
%gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
%out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%x = load i16, i16 addrspace(1)* %gep0
%y = load i16, i16 addrspace(1)* %gep1
%z = load i16, i16 addrspace(1)* %gep2
; tmp0 = min(x, y); tmp1 = max(x, y); tmp2 = min(tmp1, z); tmp3 = max(tmp0, tmp2)
%tmp0 = call i16 @smin16(i16 %x, i16 %y)
%tmp1 = call i16 @smax16(i16 %x, i16 %y)
%tmp2 = call i16 @smin16(i16 %tmp1, i16 %z)
%tmp3 = call i16 @smax16(i16 %tmp0, i16 %tmp2)
store i16 %tmp3, i16 addrspace(1)* %out.gep
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind readnone alwaysinline }

View File

@ -1,12 +1,13 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
declare i32 @llvm.r600.read.tidig.x() #0
declare i32 @llvm.amdgcn.workitem.id.x() #0
; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32:
; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.r600.read.tidig.x()
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@ -25,7 +26,7 @@ define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a
; GCN: v_max_u32
; GCN: v_min_u32
define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.r600.read.tidig.x()
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@ -45,7 +46,7 @@ define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrsp
; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.r600.read.tidig.x()
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@ -64,7 +65,7 @@ define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 a
; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.r600.read.tidig.x()
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
%a = load i32, i32 addrspace(1)* %gep0
@ -83,7 +84,7 @@ define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 ad
; GCN: v_cmp_lt_u64
; GCN: v_cmp_gt_u64
define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.r600.read.tidig.x()
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
%a = load i64, i64 addrspace(1)* %gep0
@ -99,9 +100,10 @@ define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
}
; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16:
; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
%tid = call i32 @llvm.r600.read.tidig.x()
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
%outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
%a = load i16, i16 addrspace(1)* %gep0
@ -479,6 +481,35 @@ bb:
ret void
}
; Unsigned i16 med3 pattern with three loaded operands:
;   umax(umin(x, y), umin(umax(x, y), z))
; The checks expect v_med3_u32 on the default CPU, four separate u16 min/max
; instructions on tonga (see the FIXME below), and v_med3_u16 on gfx901.
; NOTE(review): @umin16 / @umax16 are not defined in this view; presumably
; they are inlined min/max helpers from elsewhere in this test file - confirm.
; GCN-LABEL: {{^}}v_test_umed3_i16_pat_0:
; SI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; FIXME: VI not matching med3
; VI: v_min_u16
; VI: v_max_u16
; VI: v_min_u16
; VI: v_max_u16
; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
bb:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
; x, y and z are read from %a.ptr at element offsets tid, tid + 3 and tid + 8.
; The %arg parameter is not referenced in the body.
%gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
%gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
%gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
%out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
%x = load i16, i16 addrspace(1)* %gep0
%y = load i16, i16 addrspace(1)* %gep1
%z = load i16, i16 addrspace(1)* %gep2
; tmp0 = min(x, y); tmp1 = max(x, y); tmp2 = min(tmp1, z); tmp3 = max(tmp0, tmp2)
%tmp0 = call i16 @umin16(i16 %x, i16 %y)
%tmp1 = call i16 @umax16(i16 %x, i16 %y)
%tmp2 = call i16 @umin16(i16 %tmp1, i16 %z)
%tmp3 = call i16 @umax16(i16 %tmp0, i16 %tmp2)
store i16 %tmp3, i16 addrspace(1)* %out.gep
ret void
}
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
attributes #2 = { nounwind readnone alwaysinline }

View File

@ -30,3 +30,15 @@ v_or3_b32 v1, v2, v3, v4
v_pack_b32_f16 v1, v2, v3
// GFX9: v_pack_b32_f16 v1, v2, v3 ; encoding: [0x01,0x00,0xa0,0xd2,0x02,0x07,0x02,0x00]
// NOVI: :1: error: instruction not supported on this GPU
v_med3_f16 v1, v2, v3, v4
// GFX9: v_med3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfa,0xd1,0x02,0x07,0x12,0x04]
// NOVI: :1: error: instruction not supported on this GPU
v_med3_i16 v1, v2, v3, v4
// GFX9: v_med3_i16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfb,0xd1,0x02,0x07,0x12,0x04]
// NOVI: :1: error: instruction not supported on this GPU
v_med3_u16 v1, v2, v3, v4
// GFX9: v_med3_u16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfc,0xd1,0x02,0x07,0x12,0x04]
// NOVI: :1: error: instruction not supported on this GPU