
Replace the uint64_t -> double conversion algorithm with one that's more efficient.

This small bit of ASM code is sufficient to do what the old algorithm did:

     movq       %rax,  %xmm0
     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
   #ifdef __SSE3__
     haddpd   %xmm0, %xmm0          
   #else
     pshufd   $0x4e, %xmm0, %xmm1 
     addpd    %xmm1, %xmm0
   #endif

It's arguably faster (see the intrinsics sketch below). One caveat: the 'haddpd'
instruction isn't very fast on all processors.
<rdar://problem/7719814>

llvm-svn: 147593
Bill Wendling 2012-01-05 02:13:20 +00:00
parent f4817ef455
commit 6d5ac8b8df
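
Before the diff, a minimal C sketch of the sequence the commit message describes,
written with SSE2 intrinsics. The function and variable names here are mine, not
part of the patch; the sketch only mirrors the movq/punpckldq/subpd steps plus the
SSE2 tail (pshufd + addpd) and assumes an x86-64 target.

    #include <emmintrin.h>
    #include <stdint.h>

    /* Illustration only: not the code the backend emits. */
    static double uint64_to_double_sse2(uint64_t x) {
      /* movq %rax, %xmm0 -- low dwords are { lo(x), hi(x), 0, 0 }. */
      __m128i xi = _mm_cvtsi64_si128((long long)x);

      /* punpckldq (c0), %xmm0 -- interleave with the exponent words, giving
         the doubles { 2^52 + lo(x), 2^84 + 2^32*hi(x) } in one register. */
      __m128i c0 = _mm_set_epi32(0, 0, 0x45300000, 0x43300000);
      __m128d d  = _mm_castsi128_pd(_mm_unpacklo_epi32(xi, c0));

      /* subpd (c1), %xmm0 -- strip the biases; both subtractions are exact. */
      __m128d c1 = _mm_set_pd(0x1.0p84, 0x1.0p52);
      d = _mm_sub_pd(d, c1);

      /* pshufd + addpd tail -- the only step that rounds, in the current
         rounding mode. An SSE3 build could use _mm_hadd_pd instead. */
      __m128d hi = _mm_unpackhi_pd(d, d);
      return _mm_cvtsd_f64(_mm_add_sd(d, hi));
    }

Compiled with -msse2 on x86-64, this should agree with a plain (double)x cast for
every input.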


@@ -256,7 +256,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
   if (Subtarget->is64Bit()) {
     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
-    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Expand);
+    setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
   } else if (!TM.Options.UseSoftFloat) {
     // We have an algorithm for SSE2->double, and we turn this into a
     // 64-bit FILD followed by conditional FADD for other targets.
@@ -7581,38 +7581,17 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
 // LowerUINT_TO_FP_i64 - 64-bit unsigned integer to double expansion.
 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
                                                SelectionDAG &DAG) const {
-  // This algorithm is not obvious. Here it is in C code, more or less:
+  // This algorithm is not obvious. Here it is what we're trying to output:
   /*
-     double uint64_to_double( uint32_t hi, uint32_t lo ) {
-       static const __m128i exp = { 0x4330000045300000ULL, 0 };
-       static const __m128d bias = { 0x1.0p84, 0x1.0p52 };
-
-       // Copy ints to xmm registers.
-       __m128i xh = _mm_cvtsi32_si128( hi );
-       __m128i xl = _mm_cvtsi32_si128( lo );
-
-       // Combine into low half of a single xmm register.
-       __m128i x = _mm_unpacklo_epi32( xh, xl );
-       __m128d d;
-       double sd;
-
-       // Merge in appropriate exponents to give the integer bits the right
-       // magnitude.
-       x = _mm_unpacklo_epi32( x, exp );
-
-       // Subtract away the biases to deal with the IEEE-754 double precision
-       // implicit 1.
-       d = _mm_sub_pd( (__m128d) x, bias );
-
-       // All conversions up to here are exact. The correctly rounded result is
-       // calculated using the current rounding mode using the following
-       // horizontal add.
-       d = _mm_add_sd( d, _mm_unpackhi_pd( d, d ) );
-       _mm_store_sd( &sd, d );   // Because we are returning doubles in XMM, this
-                                 // store doesn't really need to be here (except
-                                 // maybe to zero the other double)
-       return sd;
-     }
+     movq       %rax,  %xmm0
+     punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
+     subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
+     #ifdef __SSE3__
+       haddpd   %xmm0, %xmm0
+     #else
+       pshufd   $0x4e, %xmm0, %xmm1
+       addpd    %xmm1, %xmm0
+     #endif
   */
 
   DebugLoc dl = Op.getDebugLoc();
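
The exponent/bias trick that the removed C comment spells out (and that the
replacement asm relies on) reduces to one identity. In my notation, with hi and lo
the high and low 32-bit halves of the input x:

  \[
    x \;=\; 2^{32}\,hi + lo
      \;=\; \bigl[(2^{52} + lo) - 2^{52}\bigr]
      \;+\; \bigl[(2^{84} + 2^{32}\,hi) - 2^{84}\bigr].
  \]

The two parenthesized values are exactly the doubles whose bit patterns have high
words 0x43300000 and 0x45300000 and low words lo and hi. Both subtractions are
exact because each 32-bit half fits in the 52-bit mantissa, so only the final
horizontal add rounds, in the current rounding mode, which is what the old comment
asserted.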
@@ -7620,46 +7599,51 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
   // Build some magic constants.
   SmallVector<Constant*,4> CV0;
-  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
   CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x43300000)));
+  CV0.push_back(ConstantInt::get(*Context, APInt(32, 0x45300000)));
   CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
   CV0.push_back(ConstantInt::get(*Context, APInt(32, 0)));
   Constant *C0 = ConstantVector::get(CV0);
   SDValue CPIdx0 = DAG.getConstantPool(C0, getPointerTy(), 16);
 
   SmallVector<Constant*,2> CV1;
-  CV1.push_back(
-    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
   CV1.push_back(
     ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
+  CV1.push_back(
+    ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
   Constant *C1 = ConstantVector::get(CV1);
   SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
 
-  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
-                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                                        Op.getOperand(0),
-                                        DAG.getIntPtrConstant(1)));
-  SDValue XR2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
-                            DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32,
-                                        Op.getOperand(0),
-                                        DAG.getIntPtrConstant(0)));
-  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32, XR1, XR2);
+  // Load the 64-bit value into an XMM register.
+  SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+                            Op.getOperand(0));
   SDValue CLod0 = DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
                               MachinePointerInfo::getConstantPool(),
                               false, false, false, 16);
-  SDValue Unpck2 = getUnpackl(DAG, dl, MVT::v4i32, Unpck1, CLod0);
-  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck2);
+  SDValue Unpck1 = getUnpackl(DAG, dl, MVT::v4i32,
+                              DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, XR1),
+                              CLod0);
   SDValue CLod1 = DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
                               MachinePointerInfo::getConstantPool(),
                               false, false, false, 16);
+  SDValue XR2F = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Unpck1);
   SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+  SDValue Result;
 
-  // Add the halves; easiest way is to swap them into another reg first.
-  int ShufMask[2] = { 1, -1 };
-  SDValue Shuf = DAG.getVectorShuffle(MVT::v2f64, dl, Sub,
-                                      DAG.getUNDEF(MVT::v2f64), ShufMask);
-  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuf, Sub);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Add,
+  if (Subtarget->hasSSE3()) {
+    // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
+    Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
+  } else {
+    SDValue S2F = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Sub);
+    SDValue Shuffle = getTargetShuffleNode(X86ISD::PSHUFD, dl, MVT::v4i32,
+                                           S2F, 0x4E, DAG);
+    Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
+                         DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Shuffle),
+                         Sub);
+  }
+
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
                      DAG.getIntPtrConstant(0));
 }
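
The magic constants built into CV0 and CV1 above can be sanity-checked in
isolation. A throwaway check of mine (not part of the patch) that the 64-bit
patterns formed from the 0x43300000 and 0x45300000 high words really are
0x1.0p52 and 0x1.0p84:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int main(void) {
      /* High dword from CV0, low dword zero -- the layout the unpack produces
         when the integer half happens to be zero. */
      uint64_t bits52 = 0x4330000000000000ULL;   /* matches CV1[0] */
      uint64_t bits84 = 0x4530000000000000ULL;   /* matches CV1[1] */
      double d52, d84;
      memcpy(&d52, &bits52, sizeof d52);
      memcpy(&d84, &bits84, sizeof d84);
      assert(d52 == 0x1.0p52);   /* exact: 2^52 */
      assert(d84 == 0x1.0p84);   /* exact: 2^84 */
      return 0;
    }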
@@ -7729,6 +7713,8 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
     return LowerUINT_TO_FP_i64(Op, DAG);
   else if (SrcVT == MVT::i32 && X86ScalarSSEf64)
     return LowerUINT_TO_FP_i32(Op, DAG);
+  else if (SrcVT == MVT::i64 && DstVT == MVT::f32)
+    return SDValue();
 
   // Make a 64-bit buffer, and use it to build an FILD.
   SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
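
Finally, a small smoke test of mine (not part of the patch) that exercises both
conversions: uint64_t -> double, which the new custom lowering is expected to
handle on 64-bit SSE2 targets, and uint64_t -> float, which the hunk above leaves
to the default expansion by returning SDValue().

    #include <stdint.h>
    #include <stdio.h>

    double u64_to_f64(uint64_t x) { return (double)x; } /* new i64->f64 path */
    float  u64_to_f32(uint64_t x) { return (float)x; }  /* falls back, per hunk */

    int main(void) {
      uint64_t samples[] = { 0, 1, (1ULL << 53) + 1,      /* forces rounding */
                             0x8000000000000000ULL, 0xFFFFFFFFFFFFFFFFULL };
      for (size_t i = 0; i < sizeof samples / sizeof samples[0]; ++i)
        printf("%llu -> %.17g / %.9g\n", (unsigned long long)samples[i],
               u64_to_f64(samples[i]), (double)u64_to_f32(samples[i]));
      return 0;
    }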