mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-23 11:13:28 +01:00
R600: Use optimized 24bit path in udivrem
v2: drop enum keyword; use correct extension mode; don't bother computing the sign in the unsigned case. Signed-off-by: Jan Vesely <jan.vesely@rutgers.edu> llvm-svn: 215462
This commit is contained in:
parent
a72063b855
commit
c9798145af
@ -1387,7 +1387,7 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
|
||||
// This is a shortcut for integer division because we have fast i32<->f32
|
||||
// conversions, and fast f32 reciprocal instructions. The fractional part of a
|
||||
// float is enough to accurately represent up to a 24-bit integer.
|
||||
SDValue AMDGPUTargetLowering::LowerSDIVREM24(SDValue Op, SelectionDAG &DAG) const {
|
||||
SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const {
|
||||
SDLoc DL(Op);
|
||||
EVT VT = Op.getValueType();
|
||||
SDValue LHS = Op.getOperand(0);
|
||||
@ -1395,6 +1395,9 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM24(SDValue Op, SelectionDAG &DAG) cons
|
||||
MVT IntVT = MVT::i32;
|
||||
MVT FltVT = MVT::f32;
|
||||
|
||||
ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
|
||||
ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
|
||||
|
||||
if (VT.isVector()) {
|
||||
unsigned NElts = VT.getVectorNumElements();
|
||||
IntVT = MVT::getVectorVT(MVT::i32, NElts);
|
||||
@ -1403,29 +1406,35 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM24(SDValue Op, SelectionDAG &DAG) cons
|
||||
|
||||
unsigned BitSize = VT.getScalarType().getSizeInBits();
|
||||
|
||||
// char|short jq = ia ^ ib;
|
||||
SDValue jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
|
||||
SDValue jq = DAG.getConstant(1, IntVT);
|
||||
|
||||
// jq = jq >> (bitsize - 2)
|
||||
jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT));
|
||||
if (sign) {
|
||||
// char|short jq = ia ^ ib;
|
||||
jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
|
||||
|
||||
// jq = jq | 0x1
|
||||
jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT));
|
||||
// jq = jq >> (bitsize - 2)
|
||||
jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT));
|
||||
|
||||
// jq = (int)jq
|
||||
jq = DAG.getSExtOrTrunc(jq, DL, IntVT);
|
||||
// jq = jq | 0x1
|
||||
jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT));
|
||||
|
||||
// jq = (int)jq
|
||||
jq = DAG.getSExtOrTrunc(jq, DL, IntVT);
|
||||
}
|
||||
|
||||
// int ia = (int)LHS;
|
||||
SDValue ia = DAG.getSExtOrTrunc(LHS, DL, IntVT);
|
||||
SDValue ia = sign ?
|
||||
DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT);
|
||||
|
||||
// int ib = (int)RHS;
|
||||
SDValue ib = DAG.getSExtOrTrunc(RHS, DL, IntVT);
|
||||
SDValue ib = sign ?
|
||||
DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT);
|
||||
|
||||
// float fa = (float)ia;
|
||||
SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FltVT, ia);
|
||||
SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
|
||||
|
||||
// float fb = (float)ib;
|
||||
SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FltVT, ib);
|
||||
SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
|
||||
|
||||
// float fq = native_divide(fa, fb);
|
||||
SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
|
||||
@ -1442,7 +1451,7 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM24(SDValue Op, SelectionDAG &DAG) cons
|
||||
DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa);
|
||||
|
||||
// int iq = (int)fq;
|
||||
SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, IntVT, fq);
|
||||
SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
|
||||
|
||||
// fr = fabs(fr);
|
||||
fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
|
||||
@ -1458,11 +1467,13 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM24(SDValue Op, SelectionDAG &DAG) cons
|
||||
// jq = (cv ? jq : 0);
|
||||
jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, VT));
|
||||
|
||||
// dst = iq + jq;
|
||||
iq = DAG.getSExtOrTrunc(iq, DL, VT);
|
||||
// dst = trunc/extend to legal type
|
||||
iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT);
|
||||
|
||||
// dst = iq + jq;
|
||||
SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
|
||||
|
||||
// Rem needs compensation, it's easier to recompute it
|
||||
SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
|
||||
Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
|
||||
|
||||
@ -1481,6 +1492,16 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
|
||||
SDValue Num = Op.getOperand(0);
|
||||
SDValue Den = Op.getOperand(1);
|
||||
|
||||
if (VT == MVT::i32) {
|
||||
if (DAG.MaskedValueIsZero(Op.getOperand(0), APInt(32, 0xff << 24)) &&
|
||||
DAG.MaskedValueIsZero(Op.getOperand(1), APInt(32, 0xff << 24))) {
|
||||
// TODO: We technically could do this for i64, but shouldn't that just be
|
||||
// handled by something generally reducing 64-bit division on 32-bit
|
||||
// values to 32-bit?
|
||||
return LowerDIVREM24(Op, DAG, false);
|
||||
}
|
||||
}
|
||||
|
||||
// RCP = URECIP(Den) = 2^32 / Den + e
|
||||
// e is rounding error.
|
||||
SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
|
||||
@ -1591,7 +1612,7 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
|
||||
// TODO: We technically could do this for i64, but shouldn't that just be
|
||||
// handled by something generally reducing 64-bit division on 32-bit
|
||||
// values to 32-bit?
|
||||
return LowerSDIVREM24(Op, DAG);
|
||||
return LowerDIVREM24(Op, DAG, true);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -82,7 +82,7 @@ protected:
|
||||
SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerSDIVREM24(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
|
||||
bool isHWTrueValue(SDValue Op) const;
|
||||
bool isHWFalseValue(SDValue Op) const;
|
||||
|
||||
|
244
test/CodeGen/R600/udivrem24.ll
Normal file
244
test/CodeGen/R600/udivrem24.ll
Normal file
@ -0,0 +1,244 @@
|
||||
; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||
|
||||
; FUNC-LABEL: @udiv24_i8
|
||||
; SI: V_CVT_F32_UBYTE
|
||||
; SI: V_CVT_F32_UBYTE
|
||||
; SI: V_RCP_F32
|
||||
; SI: V_CVT_U32_F32
|
||||
|
||||
; EG: UINT_TO_FLT
|
||||
; EG-DAG: UINT_TO_FLT
|
||||
; EG-DAG: RECIP_IEEE
|
||||
; EG: FLT_TO_UINT
|
||||
define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
|
||||
%den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
|
||||
%num = load i8 addrspace(1) * %in
|
||||
%den = load i8 addrspace(1) * %den_ptr
|
||||
%result = udiv i8 %num, %den
|
||||
store i8 %result, i8 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @udiv24_i16
|
||||
; SI: V_CVT_F32_U32
|
||||
; SI: V_CVT_F32_U32
|
||||
; SI: V_RCP_F32
|
||||
; SI: V_CVT_U32_F32
|
||||
|
||||
; EG: UINT_TO_FLT
|
||||
; EG-DAG: UINT_TO_FLT
|
||||
; EG-DAG: RECIP_IEEE
|
||||
; EG: FLT_TO_UINT
|
||||
define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
|
||||
%den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
|
||||
%num = load i16 addrspace(1) * %in, align 2
|
||||
%den = load i16 addrspace(1) * %den_ptr, align 2
|
||||
%result = udiv i16 %num, %den
|
||||
store i16 %result, i16 addrspace(1)* %out, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @udiv24_i32
|
||||
; SI: V_CVT_F32_U32
|
||||
; SI-DAG: V_CVT_F32_U32
|
||||
; SI-DAG: V_RCP_F32
|
||||
; SI: V_CVT_U32_F32
|
||||
|
||||
; EG: UINT_TO_FLT
|
||||
; EG-DAG: UINT_TO_FLT
|
||||
; EG-DAG: RECIP_IEEE
|
||||
; EG: FLT_TO_UINT
|
||||
define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
||||
%den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
|
||||
%num = load i32 addrspace(1) * %in, align 4
|
||||
%den = load i32 addrspace(1) * %den_ptr, align 4
|
||||
%num.i24.0 = shl i32 %num, 8
|
||||
%den.i24.0 = shl i32 %den, 8
|
||||
%num.i24 = lshr i32 %num.i24.0, 8
|
||||
%den.i24 = lshr i32 %den.i24.0, 8
|
||||
%result = udiv i32 %num.i24, %den.i24
|
||||
store i32 %result, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @udiv25_i32
|
||||
; RCP_IFLAG is for URECIP in the full 32b alg
|
||||
; SI: V_RCP_IFLAG
|
||||
; SI-NOT: V_RCP_F32
|
||||
|
||||
; EG-NOT: UINT_TO_FLT
|
||||
; EG-NOT: RECIP_IEEE
|
||||
define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
||||
%den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
|
||||
%num = load i32 addrspace(1) * %in, align 4
|
||||
%den = load i32 addrspace(1) * %den_ptr, align 4
|
||||
%num.i24.0 = shl i32 %num, 7
|
||||
%den.i24.0 = shl i32 %den, 7
|
||||
%num.i24 = lshr i32 %num.i24.0, 7
|
||||
%den.i24 = lshr i32 %den.i24.0, 7
|
||||
%result = udiv i32 %num.i24, %den.i24
|
||||
store i32 %result, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @test_no_udiv24_i32_1
|
||||
; RCP_IFLAG is for URECIP in the full 32b alg
|
||||
; SI: V_RCP_IFLAG
|
||||
; SI-NOT: V_RCP_F32
|
||||
|
||||
; EG-NOT: UINT_TO_FLT
|
||||
; EG-NOT: RECIP_IEEE
|
||||
define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
||||
%den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
|
||||
%num = load i32 addrspace(1) * %in, align 4
|
||||
%den = load i32 addrspace(1) * %den_ptr, align 4
|
||||
%num.i24.0 = shl i32 %num, 8
|
||||
%den.i24.0 = shl i32 %den, 7
|
||||
%num.i24 = lshr i32 %num.i24.0, 8
|
||||
%den.i24 = lshr i32 %den.i24.0, 7
|
||||
%result = udiv i32 %num.i24, %den.i24
|
||||
store i32 %result, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @test_no_udiv24_i32_2
|
||||
; RCP_IFLAG is for URECIP in the full 32b alg
|
||||
; SI: V_RCP_IFLAG
|
||||
; SI-NOT: V_RCP_F32
|
||||
|
||||
; EG-NOT: UINT_TO_FLT
|
||||
; EG-NOT: RECIP_IEEE
|
||||
define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
||||
%den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
|
||||
%num = load i32 addrspace(1) * %in, align 4
|
||||
%den = load i32 addrspace(1) * %den_ptr, align 4
|
||||
%num.i24.0 = shl i32 %num, 7
|
||||
%den.i24.0 = shl i32 %den, 8
|
||||
%num.i24 = lshr i32 %num.i24.0, 7
|
||||
%den.i24 = lshr i32 %den.i24.0, 8
|
||||
%result = udiv i32 %num.i24, %den.i24
|
||||
store i32 %result, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @urem24_i8
|
||||
; SI: V_CVT_F32_UBYTE
|
||||
; SI: V_CVT_F32_UBYTE
|
||||
; SI: V_RCP_F32
|
||||
; SI: V_CVT_U32_F32
|
||||
|
||||
; EG: UINT_TO_FLT
|
||||
; EG-DAG: UINT_TO_FLT
|
||||
; EG-DAG: RECIP_IEEE
|
||||
; EG: FLT_TO_UINT
|
||||
define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
|
||||
%den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
|
||||
%num = load i8 addrspace(1) * %in
|
||||
%den = load i8 addrspace(1) * %den_ptr
|
||||
%result = urem i8 %num, %den
|
||||
store i8 %result, i8 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @urem24_i16
|
||||
; SI: V_CVT_F32_U32
|
||||
; SI: V_CVT_F32_U32
|
||||
; SI: V_RCP_F32
|
||||
; SI: V_CVT_U32_F32
|
||||
|
||||
; EG: UINT_TO_FLT
|
||||
; EG-DAG: UINT_TO_FLT
|
||||
; EG-DAG: RECIP_IEEE
|
||||
; EG: FLT_TO_UINT
|
||||
define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
|
||||
%den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
|
||||
%num = load i16 addrspace(1) * %in, align 2
|
||||
%den = load i16 addrspace(1) * %den_ptr, align 2
|
||||
%result = urem i16 %num, %den
|
||||
store i16 %result, i16 addrspace(1)* %out, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @urem24_i32
|
||||
; SI: V_CVT_F32_U32
|
||||
; SI: V_CVT_F32_U32
|
||||
; SI: V_RCP_F32
|
||||
; SI: V_CVT_U32_F32
|
||||
|
||||
; EG: UINT_TO_FLT
|
||||
; EG-DAG: UINT_TO_FLT
|
||||
; EG-DAG: RECIP_IEEE
|
||||
; EG: FLT_TO_UINT
|
||||
define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
||||
%den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
|
||||
%num = load i32 addrspace(1) * %in, align 4
|
||||
%den = load i32 addrspace(1) * %den_ptr, align 4
|
||||
%num.i24.0 = shl i32 %num, 8
|
||||
%den.i24.0 = shl i32 %den, 8
|
||||
%num.i24 = lshr i32 %num.i24.0, 8
|
||||
%den.i24 = lshr i32 %den.i24.0, 8
|
||||
%result = urem i32 %num.i24, %den.i24
|
||||
store i32 %result, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @urem25_i32
|
||||
; RCP_IFLAG is for URECIP in the full 32b alg
|
||||
; SI: V_RCP_IFLAG
|
||||
; SI-NOT: V_RCP_F32
|
||||
|
||||
; EG-NOT: UINT_TO_FLT
|
||||
; EG-NOT: RECIP_IEEE
|
||||
define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
||||
%den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
|
||||
%num = load i32 addrspace(1) * %in, align 4
|
||||
%den = load i32 addrspace(1) * %den_ptr, align 4
|
||||
%num.i24.0 = shl i32 %num, 7
|
||||
%den.i24.0 = shl i32 %den, 7
|
||||
%num.i24 = lshr i32 %num.i24.0, 7
|
||||
%den.i24 = lshr i32 %den.i24.0, 7
|
||||
%result = urem i32 %num.i24, %den.i24
|
||||
store i32 %result, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @test_no_urem24_i32_1
|
||||
; RCP_IFLAG is for URECIP in the full 32b alg
|
||||
; SI: V_RCP_IFLAG
|
||||
; SI-NOT: V_RCP_F32
|
||||
|
||||
; EG-NOT: UINT_TO_FLT
|
||||
; EG-NOT: RECIP_IEEE
|
||||
define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
||||
%den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
|
||||
%num = load i32 addrspace(1) * %in, align 4
|
||||
%den = load i32 addrspace(1) * %den_ptr, align 4
|
||||
%num.i24.0 = shl i32 %num, 8
|
||||
%den.i24.0 = shl i32 %den, 7
|
||||
%num.i24 = lshr i32 %num.i24.0, 8
|
||||
%den.i24 = lshr i32 %den.i24.0, 7
|
||||
%result = urem i32 %num.i24, %den.i24
|
||||
store i32 %result, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @test_no_urem24_i32_2
|
||||
; RCP_IFLAG is for URECIP in the full 32b alg
|
||||
; SI: V_RCP_IFLAG
|
||||
; SI-NOT: V_RCP_F32
|
||||
|
||||
; EG-NOT: UINT_TO_FLT
|
||||
; EG-NOT: RECIP_IEEE
|
||||
define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
|
||||
%den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
|
||||
%num = load i32 addrspace(1) * %in, align 4
|
||||
%den = load i32 addrspace(1) * %den_ptr, align 4
|
||||
%num.i24.0 = shl i32 %num, 7
|
||||
%den.i24.0 = shl i32 %den, 8
|
||||
%num.i24 = lshr i32 %num.i24.0, 7
|
||||
%den.i24 = lshr i32 %den.i24.0, 8
|
||||
%result = urem i32 %num.i24, %den.i24
|
||||
store i32 %result, i32 addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
Loading…
Reference in New Issue
Block a user