R600: add an unsigned 24-bit fast path for UDIVREM

LowerSDIVREM24 becomes LowerDIVREM24 with a `sign` flag that selects signed or
unsigned extension and int<->float conversion. LowerUDIVREM now uses it for
i32 operands whose top 8 bits are known to be zero. A new test,
test/CodeGen/R600/udivrem24.ll, covers udiv/urem on i8, i16 and masked i32
values, plus cases where an operand may need 25 bits and the full 32-bit
lowering must be kept.

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index e3234423865..83cb3f5cf25 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -1387,7 +1387,10 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 // This is a shortcut for integer division because we have fast i32<->f32
 // conversions, and fast f32 reciprocal instructions. The fractional part of a
 // float is enough to accurately represent up to a 24-bit integer.
-SDValue AMDGPUTargetLowering::LowerSDIVREM24(SDValue Op, SelectionDAG &DAG) const {
+// If sign is true the operands are treated as signed (sext, SINT_TO_FP /
+// FP_TO_SINT); otherwise as unsigned (zext, UINT_TO_FP / FP_TO_UINT).
+SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
+                                            bool sign) const {
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
   SDValue LHS = Op.getOperand(0);
@@ -1395,6 +1398,9 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM24(SDValue Op, SelectionDAG &DAG) cons
   MVT IntVT = MVT::i32;
   MVT FltVT = MVT::f32;
 
+  ISD::NodeType ToFp = sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
+  ISD::NodeType ToInt = sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
+
   if (VT.isVector()) {
     unsigned NElts = VT.getVectorNumElements();
     IntVT = MVT::getVectorVT(MVT::i32, NElts);
@@ -1403,29 +1409,35 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM24(SDValue Op, SelectionDAG &DAG) cons
 
   unsigned BitSize = VT.getScalarType().getSizeInBits();
 
-  // char|short jq = ia ^ ib;
-  SDValue jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
+  SDValue jq = DAG.getConstant(1, IntVT);
 
-  // jq = jq >> (bitsize - 2)
-  jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT));
+  if (sign) {
+    // char|short jq = ia ^ ib;
+    jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
 
-  // jq = jq | 0x1
-  jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT));
+    // jq = jq >> (bitsize - 2)
+    jq = DAG.getNode(ISD::SRA, DL, VT, jq, DAG.getConstant(BitSize - 2, VT));
 
-  // jq = (int)jq
-  jq = DAG.getSExtOrTrunc(jq, DL, IntVT);
+    // jq = jq | 0x1
+    jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, VT));
+
+    // jq = (int)jq
+    jq = DAG.getSExtOrTrunc(jq, DL, IntVT);
+  }
 
   // int ia = (int)LHS;
-  SDValue ia = DAG.getSExtOrTrunc(LHS, DL, IntVT);
+  SDValue ia = sign ?
+    DAG.getSExtOrTrunc(LHS, DL, IntVT) : DAG.getZExtOrTrunc(LHS, DL, IntVT);
 
   // int ib, (int)RHS;
-  SDValue ib = DAG.getSExtOrTrunc(RHS, DL, IntVT);
+  SDValue ib = sign ?
+    DAG.getSExtOrTrunc(RHS, DL, IntVT) : DAG.getZExtOrTrunc(RHS, DL, IntVT);
 
   // float fa = (float)ia;
-  SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FltVT, ia);
+  SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
 
   // float fb = (float)ib;
-  SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FltVT, ib);
+  SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
 
   // float fq = native_divide(fa, fb);
   SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
@@ -1442,7 +1454,7 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM24(SDValue Op, SelectionDAG &DAG) cons
       DAG.getNode(ISD::FMUL, DL, FltVT, fqneg, fb), fa);
 
   // int iq = (int)fq;
-  SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, IntVT, fq);
+  SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
 
   // fr = fabs(fr);
   fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
@@ -1458,11 +1470,13 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM24(SDValue Op, SelectionDAG &DAG) cons
   // jq = (cv ? jq : 0);
   jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, VT));
 
-  // dst = iq + jq;
-  iq = DAG.getSExtOrTrunc(iq, DL, VT);
+  // dst = trunc/extend to legal type
+  iq = sign ? DAG.getSExtOrTrunc(iq, DL, VT) : DAG.getZExtOrTrunc(iq, DL, VT);
 
+  // dst = iq + jq;
   SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
 
+  // Rem needs compensation, it's easier to recompute it
   SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
   Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
 
@@ -1481,6 +1495,19 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
   SDValue Num = Op.getOperand(0);
   SDValue Den = Op.getOperand(1);
 
+  if (VT == MVT::i32) {
+    // Both operands fit in 24 bits when their top 8 bits are known zero, so
+    // the float-based 24-bit lowering can be used.
+    const APInt HighByte = APInt::getHighBitsSet(32, 8);
+    if (DAG.MaskedValueIsZero(Num, HighByte) &&
+        DAG.MaskedValueIsZero(Den, HighByte)) {
+      // TODO: We technically could do this for i64, but shouldn't that just be
+      // handled by something generally reducing 64-bit division on 32-bit
+      // values to 32-bit?
+      return LowerDIVREM24(Op, DAG, false);
+    }
+  }
+
   // RCP = URECIP(Den) = 2^32 / Den + e
   // e is rounding error.
   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
@@ -1591,7 +1618,7 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
       // TODO: We technically could do this for i64, but shouldn't that just be
       // handled by something generally reducing 64-bit division on 32-bit
       // values to 32-bit?
-      return LowerSDIVREM24(Op, DAG);
+      return LowerDIVREM24(Op, DAG, true);
     }
   }
 
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 574e9b25c7f..fe576a3398d 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -82,7 +82,7 @@ protected:
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSDIVREM24(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
   bool isHWTrueValue(SDValue Op) const;
   bool isHWFalseValue(SDValue Op) const;
 
diff --git a/test/CodeGen/R600/udivrem24.ll b/test/CodeGen/R600/udivrem24.ll
new file mode 100644
index 00000000000..219c662b7ef
--- /dev/null
+++ b/test/CodeGen/R600/udivrem24.ll
@@ -0,0 +1,244 @@
+; RUN: llc -march=r600 -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+
+; FUNC-LABEL: @udiv24_i8
+; SI: V_CVT_F32_UBYTE
+; SI: V_CVT_F32_UBYTE
+; SI: V_RCP_F32
+; SI: V_CVT_U32_F32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
+  %num = load i8 addrspace(1) * %in
+  %den = load i8 addrspace(1) * %den_ptr
+  %result = udiv i8 %num, %den
+  store i8 %result, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @udiv24_i16
+; SI: V_CVT_F32_U32
+; SI: V_CVT_F32_U32
+; SI: V_RCP_F32
+; SI: V_CVT_U32_F32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
+  %num = load i16 addrspace(1) * %in, align 2
+  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %result = udiv i16 %num, %den
+  store i16 %result, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; FUNC-LABEL: @udiv24_i32
+; SI: V_CVT_F32_U32
+; SI-DAG: V_CVT_F32_U32
+; SI-DAG: V_RCP_F32
+; SI: V_CVT_U32_F32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = lshr i32 %num.i24.0, 8
+  %den.i24 = lshr i32 %den.i24.0, 8
+  %result = udiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @udiv25_i32
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: V_RCP_IFLAG
+; SI-NOT: V_RCP_F32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = lshr i32 %num.i24.0, 7
+  %den.i24 = lshr i32 %den.i24.0, 7
+  %result = udiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_no_udiv24_i32_1
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: V_RCP_IFLAG
+; SI-NOT: V_RCP_F32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = lshr i32 %num.i24.0, 8
+  %den.i24 = lshr i32 %den.i24.0, 7
+  %result = udiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_no_udiv24_i32_2
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: V_RCP_IFLAG
+; SI-NOT: V_RCP_F32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = lshr i32 %num.i24.0, 7
+  %den.i24 = lshr i32 %den.i24.0, 8
+  %result = udiv i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @urem24_i8
+; SI: V_CVT_F32_UBYTE
+; SI: V_CVT_F32_UBYTE
+; SI: V_RCP_F32
+; SI: V_CVT_U32_F32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %den_ptr = getelementptr i8 addrspace(1)* %in, i8 1
+  %num = load i8 addrspace(1) * %in
+  %den = load i8 addrspace(1) * %den_ptr
+  %result = urem i8 %num, %den
+  store i8 %result, i8 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: @urem24_i16
+; SI: V_CVT_F32_U32
+; SI: V_CVT_F32_U32
+; SI: V_RCP_F32
+; SI: V_CVT_U32_F32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+  %den_ptr = getelementptr i16 addrspace(1)* %in, i16 1
+  %num = load i16 addrspace(1) * %in, align 2
+  %den = load i16 addrspace(1) * %den_ptr, align 2
+  %result = urem i16 %num, %den
+  store i16 %result, i16 addrspace(1)* %out, align 2
+  ret void
+}
+
+; FUNC-LABEL: @urem24_i32
+; SI: V_CVT_F32_U32
+; SI: V_CVT_F32_U32
+; SI: V_RCP_F32
+; SI: V_CVT_U32_F32
+
+; EG: UINT_TO_FLT
+; EG-DAG: UINT_TO_FLT
+; EG-DAG: RECIP_IEEE
+; EG: FLT_TO_UINT
+define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = lshr i32 %num.i24.0, 8
+  %den.i24 = lshr i32 %den.i24.0, 8
+  %result = urem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @urem25_i32
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: V_RCP_IFLAG
+; SI-NOT: V_RCP_F32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = lshr i32 %num.i24.0, 7
+  %den.i24 = lshr i32 %den.i24.0, 7
+  %result = urem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_no_urem24_i32_1
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: V_RCP_IFLAG
+; SI-NOT: V_RCP_F32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 8
+  %den.i24.0 = shl i32 %den, 7
+  %num.i24 = lshr i32 %num.i24.0, 8
+  %den.i24 = lshr i32 %den.i24.0, 7
+  %result = urem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: @test_no_urem24_i32_2
+; RCP_IFLAG is for URECIP in the full 32b alg
+; SI: V_RCP_IFLAG
+; SI-NOT: V_RCP_F32
+
+; EG-NOT: UINT_TO_FLT
+; EG-NOT: RECIP_IEEE
+define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %num = load i32 addrspace(1) * %in, align 4
+  %den = load i32 addrspace(1) * %den_ptr, align 4
+  %num.i24.0 = shl i32 %num, 7
+  %den.i24.0 = shl i32 %den, 8
+  %num.i24 = lshr i32 %num.i24.0, 7
+  %den.i24 = lshr i32 %den.i24.0, 8
+  %result = urem i32 %num.i24, %den.i24
+  store i32 %result, i32 addrspace(1)* %out, align 4
+  ret void
+}