mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-22 18:54:02 +01:00
[X86][SchedModel] SSE reciprocal square root instruction latencies.
The SSE rsqrt instruction (a fast reciprocal square root estimate) was grouped in the same scheduling IIC_SSE_SQRT* class as the accurate (but very slow) SSE sqrt instruction. For code which uses rsqrt (possibly with Newton-Raphson iterations), this poor scheduling was hurting performance. This patch splits off the rsqrt instruction from the sqrt instruction scheduling classes and creates new IIC_SSE_RSQRT* classes with latency values based on Agner Fog's instruction tables. Differential Revision: http://reviews.llvm.org/D5370 Patch by Simon Pilgrim. llvm-svn: 218517
This commit is contained in:
parent
2e26931cef
commit
c61458a223
@ -3344,6 +3344,16 @@ def SSE_SQRTSD : OpndItins<
|
||||
>;
|
||||
}
|
||||
|
||||
let Sched = WriteFRsqrt in {
|
||||
def SSE_RSQRTPS : OpndItins<
|
||||
IIC_SSE_RSQRTPS_RR, IIC_SSE_RSQRTPS_RM
|
||||
>;
|
||||
|
||||
def SSE_RSQRTSS : OpndItins<
|
||||
IIC_SSE_RSQRTSS_RR, IIC_SSE_RSQRTSS_RM
|
||||
>;
|
||||
}
|
||||
|
||||
let Sched = WriteFRcp in {
|
||||
def SSE_RCPP : OpndItins<
|
||||
IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
|
||||
@ -3622,10 +3632,10 @@ defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss,
|
||||
|
||||
// Reciprocal approximations. Note that these typically require refinement
|
||||
// in order to obtain suitable precision.
|
||||
defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTSS>,
|
||||
sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTPS>,
|
||||
defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
|
||||
sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS>,
|
||||
sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
|
||||
int_x86_avx_rsqrt_ps_256, SSE_SQRTPS>;
|
||||
int_x86_avx_rsqrt_ps_256, SSE_RSQRTPS>;
|
||||
defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
|
||||
sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
|
||||
sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
|
||||
|
@ -129,6 +129,7 @@ defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;
|
||||
defm : HWWriteResPair<WriteFMul, HWPort0, 5>;
|
||||
defm : HWWriteResPair<WriteFDiv, HWPort0, 12>; // 10-14 cycles.
|
||||
defm : HWWriteResPair<WriteFRcp, HWPort0, 5>;
|
||||
defm : HWWriteResPair<WriteFRsqrt, HWPort0, 5>;
|
||||
defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
|
||||
defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
|
||||
defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
|
||||
|
@ -117,6 +117,7 @@ defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
|
||||
defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
|
||||
defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles.
|
||||
defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
|
||||
defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
|
||||
defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>;
|
||||
defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
|
||||
defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
|
||||
|
@ -63,12 +63,13 @@ def WriteZero : SchedWrite;
|
||||
defm WriteJump : X86SchedWritePair;
|
||||
|
||||
// Floating point. This covers both scalar and vector operations.
|
||||
defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
|
||||
defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
|
||||
defm WriteFDiv : X86SchedWritePair; // Floating point division.
|
||||
defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
|
||||
defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal.
|
||||
defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
|
||||
defm WriteFAdd : X86SchedWritePair; // Floating point add/sub/compare.
|
||||
defm WriteFMul : X86SchedWritePair; // Floating point multiplication.
|
||||
defm WriteFDiv : X86SchedWritePair; // Floating point division.
|
||||
defm WriteFSqrt : X86SchedWritePair; // Floating point square root.
|
||||
defm WriteFRcp : X86SchedWritePair; // Floating point reciprocal estimate.
|
||||
defm WriteFRsqrt : X86SchedWritePair; // Floating point reciprocal square root estimate.
|
||||
defm WriteFMA : X86SchedWritePair; // Fused Multiply Add.
|
||||
defm WriteFShuffle : X86SchedWritePair; // Floating point vector shuffles.
|
||||
defm WriteFBlend : X86SchedWritePair; // Floating point vector blends.
|
||||
defm WriteFVarBlend : X86SchedWritePair; // Fp vector variable blends.
|
||||
@ -314,6 +315,11 @@ def IIC_SSE_SQRTPD_RM : InstrItinClass;
|
||||
def IIC_SSE_SQRTSD_RR : InstrItinClass;
|
||||
def IIC_SSE_SQRTSD_RM : InstrItinClass;
|
||||
|
||||
def IIC_SSE_RSQRTPS_RR : InstrItinClass;
|
||||
def IIC_SSE_RSQRTPS_RM : InstrItinClass;
|
||||
def IIC_SSE_RSQRTSS_RR : InstrItinClass;
|
||||
def IIC_SSE_RSQRTSS_RM : InstrItinClass;
|
||||
|
||||
def IIC_SSE_RCPP_RR : InstrItinClass;
|
||||
def IIC_SSE_RCPP_RM : InstrItinClass;
|
||||
def IIC_SSE_RCPS_RR : InstrItinClass;
|
||||
|
@ -224,6 +224,11 @@ def AtomItineraries : ProcessorItineraries<
|
||||
InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<62, [Port0, Port1]>] >,
|
||||
InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<62, [Port0, Port1]>] >,
|
||||
|
||||
InstrItinData<IIC_SSE_RSQRTPS_RR, [InstrStage<9, [Port0, Port1]>] >,
|
||||
InstrItinData<IIC_SSE_RSQRTPS_RM, [InstrStage<10, [Port0, Port1]>] >,
|
||||
InstrItinData<IIC_SSE_RSQRTSS_RR, [InstrStage<4, [Port0]>] >,
|
||||
InstrItinData<IIC_SSE_RSQRTSS_RM, [InstrStage<4, [Port0]>] >,
|
||||
|
||||
InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [Port0, Port1]>] >,
|
||||
InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<10, [Port0, Port1]>] >,
|
||||
InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [Port0]>] >,
|
||||
|
@ -163,15 +163,15 @@ defm : JWriteResIntPair<WriteJump, JALU01, 1>;
|
||||
// FIXME: should we bother splitting JFPU pipe + unit stages for fast instructions?
|
||||
// FIXME: Double precision latencies
|
||||
// FIXME: SS vs PS latencies
|
||||
// FIXME: RSQRT latencies
|
||||
// FIXME: ymm latencies
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
|
||||
defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
|
||||
defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
|
||||
defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
|
||||
defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
|
||||
defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
|
||||
defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
|
||||
defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
|
||||
defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>;
|
||||
defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
|
||||
defm : JWriteResFpuPair<WriteFBlend, JFPU01, 1>;
|
||||
defm : JWriteResFpuPair<WriteFShuffle256, JFPU01, 1>;
|
||||
|
||||
def : WriteRes<WriteFSqrt, [JFPU1, JLAGU, JFPM]> {
|
||||
|
@ -101,6 +101,7 @@ def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
|
||||
// Scalar and vector floating point.
|
||||
defm : SMWriteResPair<WriteFAdd, FPC_RSV1, 3>;
|
||||
defm : SMWriteResPair<WriteFRcp, FPC_RSV0, 5>;
|
||||
defm : SMWriteResPair<WriteFRsqrt, FPC_RSV0, 5>;
|
||||
defm : SMWriteResPair<WriteFSqrt, FPC_RSV0, 15>;
|
||||
defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
|
||||
defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
|
||||
|
Loading…
Reference in New Issue
Block a user