1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 11:13:28 +01:00

[X86][BtVer2] Fix latency and throughput of conditional SIMD store instructions.

On BtVer2 conditional SIMD stores are heavily microcoded.
The latency is directly proportional to the number of packed elements extracted
from the input vector. Also, according to micro-benchmarks, most of the
computation seems to be done in the integer unit.

Only a minority of the uOPs is executed by the FPU. The observed behaviour on
the FPU looks similar to this:
 - The input MASK value is moved to the Integer Unit
   -- [ a VMOVMSK-like uOP-executed on JFPU0].
 - In parallel, each element of the input XMM/YMM is extracted and then sent to
   the IntegerUnit through JFPU1.

As expected, a (conditional) store is executed for every extracted element.
Interestingly, a (speculative) load is executed for every extracted element too.
It is as-if a "LOAD - BIT_EXTRACT- CMOV" sequence of uOPs is repeated by the
integer unit for every contionally stored element.
VMASKMOVDQU is a special case: the number of speculative loads is always 2
(presumably, one load per quadword). That means, extra shifts and masking is
performed on (one of) the loaded quadwords before each conditional store (that
also explains the big number of non-FP uOPs retired).

This patch replaces the existing writes for conditional SIMD stores (i.e.
WriteFMaskedStore, and WriteFMaskedStoreY) with the following new writes:

  WriteFMaskedStore32  [ XMM Packed Single ]
  WriteFMaskedStore32Y [ YMM Packed Single ]
  WriteFMaskedStore64  [ XMM Packed Double ]
  WriteFMaskedStore64Y [ YMM Packed Double ]

Added a wrapper class named X86SchedWriteMaskMove in X86Schedule.td to describe
both RM and MR variants for conditional SIMD moves in a single tablegen
definition.
Instances of that class are then passed in input to multiclass avx_movmask_rm
when constructing MASKMOVPS/PD definitions.

Since this patch introduces new writes, I had to update all the X86 scheduling
models.

Differential Revision: https://reviews.llvm.org/D66801

llvm-svn: 370649
This commit is contained in:
Andrea Di Biagio 2019-09-02 12:32:28 +00:00
parent 7698ed73e1
commit 08b9b0a5ac
14 changed files with 114 additions and 43 deletions

View File

@ -7061,27 +7061,29 @@ let Predicates = [HasAVX1Only] in {
// //
multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
Intrinsic IntLd, Intrinsic IntLd256, Intrinsic IntLd, Intrinsic IntLd256,
Intrinsic IntSt, Intrinsic IntSt256> { Intrinsic IntSt, Intrinsic IntSt256,
X86SchedWriteMaskMove schedX,
X86SchedWriteMaskMove schedY> {
def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2), (ins VR128:$src1, f128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
VEX_4V, Sched<[WriteFMaskedLoad]>; VEX_4V, Sched<[schedX.RM]>;
def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2), (ins VR256:$src1, f256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
VEX_4V, VEX_L, Sched<[WriteFMaskedLoadY]>; VEX_4V, VEX_L, Sched<[schedY.RM]>;
def mr : AVX8I<opc_mr, MRMDestMem, (outs), def mr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src1, VR128:$src2), (ins f128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>,
VEX_4V, Sched<[WriteFMaskedStore]>; VEX_4V, Sched<[schedX.MR]>;
def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src1, VR256:$src2), (ins f256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
VEX_4V, VEX_L, Sched<[WriteFMaskedStoreY]>; VEX_4V, VEX_L, Sched<[schedY.MR]>;
} }
let ExeDomain = SSEPackedSingle in let ExeDomain = SSEPackedSingle in
@ -7089,13 +7091,15 @@ defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps",
int_x86_avx_maskload_ps, int_x86_avx_maskload_ps,
int_x86_avx_maskload_ps_256, int_x86_avx_maskload_ps_256,
int_x86_avx_maskstore_ps, int_x86_avx_maskstore_ps,
int_x86_avx_maskstore_ps_256>; int_x86_avx_maskstore_ps_256,
WriteFMaskMove32, WriteFMaskMove32Y>;
let ExeDomain = SSEPackedDouble in let ExeDomain = SSEPackedDouble in
defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
int_x86_avx_maskload_pd, int_x86_avx_maskload_pd,
int_x86_avx_maskload_pd_256, int_x86_avx_maskload_pd_256,
int_x86_avx_maskstore_pd, int_x86_avx_maskstore_pd,
int_x86_avx_maskstore_pd_256>; int_x86_avx_maskstore_pd_256,
WriteFMaskMove64, WriteFMaskMove64Y>;
//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values // VPERMIL - Permute Single and Double Floating-Point Values

View File

@ -232,8 +232,12 @@ defm : X86WriteRes<WriteFStoreY, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTX, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTX, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteFMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>; defm : X86WriteRes<WriteFMaskedStore32, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteFMaskedStore32Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteFMaskedStore64, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteFMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMove, [BWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [BWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [BWPort5], 1, [1], 1>;

View File

@ -231,8 +231,12 @@ defm : X86WriteRes<WriteFStoreY, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTX, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTX, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteFMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>; defm : X86WriteRes<WriteFMaskedStore32, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteFMaskedStore32Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteFMaskedStore64, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteFMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteFMove, [HWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMove, [HWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [HWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [HWPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [HWPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [HWPort5], 1, [1], 1>;

View File

@ -208,8 +208,12 @@ defm : X86WriteRes<WriteFStoreY, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteFStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteFStoreNTX, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>; defm : X86WriteRes<WriteFStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteFMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
defm : X86WriteRes<WriteFMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>; defm : X86WriteRes<WriteFMaskedStore32, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
defm : X86WriteRes<WriteFMaskedStore32Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
defm : X86WriteRes<WriteFMaskedStore64, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
defm : X86WriteRes<WriteFMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
defm : X86WriteRes<WriteFMove, [SBPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMove, [SBPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [SBPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [SBPort5], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [SBPort5], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [SBPort5], 1, [1], 1>;

View File

@ -226,8 +226,12 @@ defm : X86WriteRes<WriteFStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTX, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteFMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteFMaskedStore32, [SKLPort237,SKLPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteFMaskedStore32Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteFMaskedStore64, [SKLPort237,SKLPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteFMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteFMove, [SKLPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMove, [SKLPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [SKLPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [SKLPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [SKLPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [SKLPort015], 1, [1], 1>;

View File

@ -226,8 +226,12 @@ defm : X86WriteRes<WriteFStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTX, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>; defm : X86WriteRes<WriteFStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteFMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteFMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>; defm : X86WriteRes<WriteFMaskedStore32, [SKXPort237,SKXPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteFMaskedStore32Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteFMaskedStore64, [SKXPort237,SKXPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteFMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteFMove, [SKXPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMove, [SKXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [SKXPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [SKXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [SKXPort015], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [SKXPort015], 1, [1], 1>;

View File

@ -102,6 +102,12 @@ class X86SchedWriteMoveLS<SchedWrite MoveRR,
SchedWrite MR = StoreMR; SchedWrite MR = StoreMR;
} }
// Multiclass that wraps masked load/store writes for a vector width.
class X86SchedWriteMaskMove<SchedWrite LoadRM, SchedWrite StoreMR> {
SchedWrite RM = LoadRM;
SchedWrite MR = StoreMR;
}
// Multiclass that wraps X86SchedWriteMoveLS for each vector width. // Multiclass that wraps X86SchedWriteMoveLS for each vector width.
class X86SchedWriteMoveLSWidths<X86SchedWriteMoveLS sScl, class X86SchedWriteMoveLSWidths<X86SchedWriteMoveLS sScl,
X86SchedWriteMoveLS s128, X86SchedWriteMoveLS s128,
@ -218,8 +224,12 @@ def WriteFStoreY : SchedWrite;
def WriteFStoreNT : SchedWrite; def WriteFStoreNT : SchedWrite;
def WriteFStoreNTX : SchedWrite; def WriteFStoreNTX : SchedWrite;
def WriteFStoreNTY : SchedWrite; def WriteFStoreNTY : SchedWrite;
def WriteFMaskedStore : SchedWrite;
def WriteFMaskedStoreY : SchedWrite; def WriteFMaskedStore32 : SchedWrite;
def WriteFMaskedStore64 : SchedWrite;
def WriteFMaskedStore32Y : SchedWrite;
def WriteFMaskedStore64Y : SchedWrite;
def WriteFMove : SchedWrite; def WriteFMove : SchedWrite;
def WriteFMoveX : SchedWrite; def WriteFMoveX : SchedWrite;
def WriteFMoveY : SchedWrite; def WriteFMoveY : SchedWrite;
@ -530,6 +540,16 @@ def SchedWriteVecMoveLSNT
: X86SchedWriteMoveLSWidths<WriteVecMoveLSNT, WriteVecMoveLSNTX, : X86SchedWriteMoveLSWidths<WriteVecMoveLSNT, WriteVecMoveLSNTX,
WriteVecMoveLSNTY, WriteVecMoveLSNTY>; WriteVecMoveLSNTY, WriteVecMoveLSNTY>;
// Conditional SIMD Packed Loads and Stores wrappers.
def WriteFMaskMove32
: X86SchedWriteMaskMove<WriteFMaskedLoad, WriteFMaskedStore32>;
def WriteFMaskMove64
: X86SchedWriteMaskMove<WriteFMaskedLoad, WriteFMaskedStore64>;
def WriteFMaskMove32Y
: X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore32Y>;
def WriteFMaskMove64Y
: X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore64Y>;
// Vector width wrappers. // Vector width wrappers.
def SchedWriteFAdd def SchedWriteFAdd
: X86SchedWriteWidths<WriteFAdd, WriteFAddX, WriteFAddY, WriteFAddZ>; : X86SchedWriteWidths<WriteFAdd, WriteFAddX, WriteFAddY, WriteFAddZ>;

View File

@ -216,8 +216,10 @@ defm : X86WriteResUnsupported<WriteFStoreY>;
def : WriteRes<WriteFStoreNT, [AtomPort0]>; def : WriteRes<WriteFStoreNT, [AtomPort0]>;
def : WriteRes<WriteFStoreNTX, [AtomPort0]>; def : WriteRes<WriteFStoreNTX, [AtomPort0]>;
defm : X86WriteResUnsupported<WriteFStoreNTY>; defm : X86WriteResUnsupported<WriteFStoreNTY>;
defm : X86WriteResUnsupported<WriteFMaskedStore>; defm : X86WriteResUnsupported<WriteFMaskedStore32>;
defm : X86WriteResUnsupported<WriteFMaskedStoreY>; defm : X86WriteResUnsupported<WriteFMaskedStore32Y>;
defm : X86WriteResUnsupported<WriteFMaskedStore64>;
defm : X86WriteResUnsupported<WriteFMaskedStore64Y>;
def : WriteRes<WriteFMove, [AtomPort01]>; def : WriteRes<WriteFMove, [AtomPort01]>;
def : WriteRes<WriteFMoveX, [AtomPort01]>; def : WriteRes<WriteFMoveX, [AtomPort01]>;

View File

@ -726,8 +726,10 @@ defm : PdWriteRes<WriteFStoreNT, [PdStore, PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFStoreNTX, [PdStore, PdFPU1, PdFPSTO], 3>; defm : PdWriteRes<WriteFStoreNTX, [PdStore, PdFPU1, PdFPSTO], 3>;
defm : PdWriteRes<WriteFStoreNTY, [PdStore, PdFPU1, PdFPSTO], 3, [2, 2, 2], 4>; defm : PdWriteRes<WriteFStoreNTY, [PdStore, PdFPU1, PdFPSTO], 3, [2, 2, 2], 4>;
defm : PdWriteRes<WriteFMaskedStore, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>; defm : PdWriteRes<WriteFMaskedStore32, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
defm : PdWriteRes<WriteFMaskedStoreY, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>; defm : PdWriteRes<WriteFMaskedStore64, [PdStore, PdFPU01, PdFPFMA], 6, [1, 1, 188], 18>;
defm : PdWriteRes<WriteFMaskedStore32Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
defm : PdWriteRes<WriteFMaskedStore64Y, [PdStore, PdFPU01, PdFPFMA], 6, [2, 2, 376], 34>;
defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>; defm : PdWriteRes<WriteFMove, [PdFPU01, PdFPFMA]>;
defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>; defm : PdWriteRes<WriteFMoveX, [PdFPU01, PdFPFMA], 1, [1, 2]>;

View File

@ -512,8 +512,11 @@ defm : X86WriteRes<WriteFStoreY, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>; defm : X86WriteRes<WriteFStoreNT, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>; defm : X86WriteRes<WriteFStoreNTX, [JSAGU, JFPU1, JSTC], 3, [1, 1, 1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>; defm : X86WriteRes<WriteFStoreNTY, [JSAGU, JFPU1, JSTC], 3, [2, 2, 2], 1>;
defm : X86WriteRes<WriteFMaskedStore, [JSAGU, JFPU01, JFPX], 6, [1, 1, 4], 1>;
defm : X86WriteRes<WriteFMaskedStoreY, [JSAGU, JFPU01, JFPX], 6, [2, 2, 4], 2>; defm : X86WriteRes<WriteFMaskedStore32, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 5, 5,4,4,4], 19>;
defm : X86WriteRes<WriteFMaskedStore64, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 13, [1,1, 2, 2,2,2,2], 10>;
defm : X86WriteRes<WriteFMaskedStore32Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 22, [1,1,10,10,8,8,8], 36>;
defm : X86WriteRes<WriteFMaskedStore64Y, [JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01], 16, [1,1, 4, 4,4,4,4], 18>;
defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>; defm : X86WriteRes<WriteFMove, [JFPU01, JFPX], 1, [1, 1], 1>;
defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>; defm : X86WriteRes<WriteFMoveX, [JFPU01, JFPX], 1, [1, 1], 1>;
@ -818,6 +821,18 @@ def JWriteJVZEROUPPER: SchedWriteRes<[]> {
} }
def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>; def : InstRW<[JWriteJVZEROUPPER], (instrs VZEROUPPER)>;
///////////////////////////////////////////////////////////////////////////////
// SSE2/AVX Store Selected Bytes of Double Quadword - (V)MASKMOVDQ
///////////////////////////////////////////////////////////////////////////////
def JWriteMASKMOVDQU: SchedWriteRes<[JFPU0, JFPA, JFPU1, JSTC, JLAGU, JSAGU, JALU01]> {
let Latency = 34;
let ResourceCycles = [1, 1, 2, 2, 2, 16, 42];
let NumMicroOps = 63;
}
def : InstRW<[JWriteMASKMOVDQU], (instrs MASKMOVDQU, MASKMOVDQU64,
VMASKMOVDQU, VMASKMOVDQU64)>;
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////
// SchedWriteVariant definitions. // SchedWriteVariant definitions.
/////////////////////////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////////////

View File

@ -186,8 +186,12 @@ def : WriteRes<WriteFStoreY, [SLM_MEC_RSV]>;
def : WriteRes<WriteFStoreNT, [SLM_MEC_RSV]>; def : WriteRes<WriteFStoreNT, [SLM_MEC_RSV]>;
def : WriteRes<WriteFStoreNTX, [SLM_MEC_RSV]>; def : WriteRes<WriteFStoreNTX, [SLM_MEC_RSV]>;
def : WriteRes<WriteFStoreNTY, [SLM_MEC_RSV]>; def : WriteRes<WriteFStoreNTY, [SLM_MEC_RSV]>;
def : WriteRes<WriteFMaskedStore, [SLM_MEC_RSV]>;
def : WriteRes<WriteFMaskedStoreY, [SLM_MEC_RSV]>; def : WriteRes<WriteFMaskedStore32, [SLM_MEC_RSV]>;
def : WriteRes<WriteFMaskedStore32Y, [SLM_MEC_RSV]>;
def : WriteRes<WriteFMaskedStore64, [SLM_MEC_RSV]>;
def : WriteRes<WriteFMaskedStore64Y, [SLM_MEC_RSV]>;
def : WriteRes<WriteFMove, [SLM_FPC_RSV01]>; def : WriteRes<WriteFMove, [SLM_FPC_RSV01]>;
def : WriteRes<WriteFMoveX, [SLM_FPC_RSV01]>; def : WriteRes<WriteFMoveX, [SLM_FPC_RSV01]>;
def : WriteRes<WriteFMoveY, [SLM_FPC_RSV01]>; def : WriteRes<WriteFMoveY, [SLM_FPC_RSV01]>;

View File

@ -268,8 +268,12 @@ defm : X86WriteRes<WriteFStoreY, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteFStoreNT, [ZnAGU,ZnFPU2], 8, [1,1], 1>; defm : X86WriteRes<WriteFStoreNT, [ZnAGU,ZnFPU2], 8, [1,1], 1>;
defm : X86WriteRes<WriteFStoreNTX, [ZnAGU], 1, [1], 1>; defm : X86WriteRes<WriteFStoreNTX, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteFStoreNTY, [ZnAGU], 1, [1], 1>; defm : X86WriteRes<WriteFStoreNTY, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteFMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
defm : X86WriteRes<WriteFMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>; defm : X86WriteRes<WriteFMaskedStore32, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
defm : X86WriteRes<WriteFMaskedStore32Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
defm : X86WriteRes<WriteFMaskedStore64, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
defm : X86WriteRes<WriteFMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
defm : X86WriteRes<WriteFMove, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteFMove, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteFMoveX, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>; defm : X86WriteRes<WriteFMoveY, [ZnFPU], 1, [1], 1>;

View File

@ -1219,15 +1219,15 @@ vzeroupper
# CHECK-NEXT: 1 5 1.00 * vlddqu (%rax), %xmm2 # CHECK-NEXT: 1 5 1.00 * vlddqu (%rax), %xmm2
# CHECK-NEXT: 1 5 1.00 * vlddqu (%rax), %ymm2 # CHECK-NEXT: 1 5 1.00 * vlddqu (%rax), %ymm2
# CHECK-NEXT: 1 3 1.00 * U vldmxcsr (%rax) # CHECK-NEXT: 1 3 1.00 * U vldmxcsr (%rax)
# CHECK-NEXT: 1 1 1.00 * * U vmaskmovdqu %xmm0, %xmm1 # CHECK-NEXT: 63 34 21.00 * * U vmaskmovdqu %xmm0, %xmm1
# CHECK-NEXT: 1 6 1.00 * vmaskmovpd (%rax), %xmm0, %xmm2 # CHECK-NEXT: 1 6 1.00 * vmaskmovpd (%rax), %xmm0, %xmm2
# CHECK-NEXT: 2 6 2.00 * vmaskmovpd (%rax), %ymm0, %ymm2 # CHECK-NEXT: 2 6 2.00 * vmaskmovpd (%rax), %ymm0, %ymm2
# CHECK-NEXT: 1 6 2.00 * * vmaskmovpd %xmm0, %xmm1, (%rax) # CHECK-NEXT: 10 13 2.00 * * vmaskmovpd %xmm0, %xmm1, (%rax)
# CHECK-NEXT: 2 6 2.00 * * vmaskmovpd %ymm0, %ymm1, (%rax) # CHECK-NEXT: 18 16 4.00 * * vmaskmovpd %ymm0, %ymm1, (%rax)
# CHECK-NEXT: 1 6 1.00 * vmaskmovps (%rax), %xmm0, %xmm2 # CHECK-NEXT: 1 6 1.00 * vmaskmovps (%rax), %xmm0, %xmm2
# CHECK-NEXT: 2 6 2.00 * vmaskmovps (%rax), %ymm0, %ymm2 # CHECK-NEXT: 2 6 2.00 * vmaskmovps (%rax), %ymm0, %ymm2
# CHECK-NEXT: 1 6 2.00 * * vmaskmovps %xmm0, %xmm1, (%rax) # CHECK-NEXT: 19 16 5.00 * * vmaskmovps %xmm0, %xmm1, (%rax)
# CHECK-NEXT: 2 6 2.00 * * vmaskmovps %ymm0, %ymm1, (%rax) # CHECK-NEXT: 36 22 10.00 * * vmaskmovps %ymm0, %ymm1, (%rax)
# CHECK-NEXT: 1 2 1.00 vmaxpd %xmm0, %xmm1, %xmm2 # CHECK-NEXT: 1 2 1.00 vmaxpd %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 7 1.00 * vmaxpd (%rax), %xmm1, %xmm2 # CHECK-NEXT: 1 7 1.00 * vmaxpd (%rax), %xmm1, %xmm2
# CHECK-NEXT: 2 2 2.00 vmaxpd %ymm0, %ymm1, %ymm2 # CHECK-NEXT: 2 2 2.00 vmaxpd %ymm0, %ymm1, %ymm2
@ -1740,7 +1740,7 @@ vzeroupper
# CHECK: Resource pressure per iteration: # CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
# CHECK-NEXT: 56.00 - - 365.00 915.00 447.50 461.50 394.00 - 51.00 132.00 135.50 159.50 38.00 # CHECK-NEXT: 86.00 30.00 - 362.00 907.00 449.50 480.50 414.00 - 78.00 154.00 135.50 159.50 38.00
# CHECK: Resource pressure by instruction: # CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
@ -1933,15 +1933,15 @@ vzeroupper
# CHECK-NEXT: - - - - - 0.50 0.50 1.00 - - - 0.50 0.50 - vlddqu (%rax), %xmm2 # CHECK-NEXT: - - - - - 0.50 0.50 1.00 - - - 0.50 0.50 - vlddqu (%rax), %xmm2
# CHECK-NEXT: - - - - - 0.50 0.50 1.00 - - - 0.50 0.50 - vlddqu (%rax), %ymm2 # CHECK-NEXT: - - - - - 0.50 0.50 1.00 - - - 0.50 0.50 - vlddqu (%rax), %ymm2
# CHECK-NEXT: - - - - - - - 1.00 - - - - - - vldmxcsr (%rax) # CHECK-NEXT: - - - - - - - 1.00 - - - - - - vldmxcsr (%rax)
# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - vmaskmovdqu %xmm0, %xmm1 # CHECK-NEXT: 21.00 21.00 - 1.00 - 1.00 2.00 2.00 - 16.00 2.00 - - - vmaskmovdqu %xmm0, %xmm1
# CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 1.00 - - - - - - vmaskmovpd (%rax), %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 1.00 - - - - - - vmaskmovpd (%rax), %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 2.00 2.00 2.00 2.00 - - - - - - vmaskmovpd (%rax), %ymm0, %ymm2 # CHECK-NEXT: - - - 2.00 2.00 2.00 2.00 2.00 - - - - - - vmaskmovpd (%rax), %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 2.00 0.50 0.50 - - 1.00 - - - - vmaskmovpd %xmm0, %xmm1, (%rax) # CHECK-NEXT: 1.00 1.00 - 1.00 - 1.00 2.00 2.00 - 2.00 2.00 - - - vmaskmovpd %xmm0, %xmm1, (%rax)
# CHECK-NEXT: - - - 2.00 2.00 1.00 1.00 - - 2.00 - - - - vmaskmovpd %ymm0, %ymm1, (%rax) # CHECK-NEXT: 2.00 2.00 - 1.00 - 1.00 4.00 4.00 - 4.00 4.00 - - - vmaskmovpd %ymm0, %ymm1, (%rax)
# CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 1.00 - - - - - - vmaskmovps (%rax), %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 1.00 1.00 1.00 1.00 - - - - - - vmaskmovps (%rax), %xmm0, %xmm2
# CHECK-NEXT: - - - 2.00 2.00 2.00 2.00 2.00 - - - - - - vmaskmovps (%rax), %ymm0, %ymm2 # CHECK-NEXT: - - - 2.00 2.00 2.00 2.00 2.00 - - - - - - vmaskmovps (%rax), %ymm0, %ymm2
# CHECK-NEXT: - - - 2.00 2.00 0.50 0.50 - - 1.00 - - - - vmaskmovps %xmm0, %xmm1, (%rax) # CHECK-NEXT: 2.00 2.00 - 1.00 - 1.00 5.00 4.00 - 4.00 5.00 - - - vmaskmovps %xmm0, %xmm1, (%rax)
# CHECK-NEXT: - - - 2.00 2.00 1.00 1.00 - - 2.00 - - - - vmaskmovps %ymm0, %ymm1, (%rax) # CHECK-NEXT: 4.00 4.00 - 1.00 - 1.00 10.00 8.00 - 8.00 10.00 - - - vmaskmovps %ymm0, %ymm1, (%rax)
# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vmaxpd %xmm0, %xmm1, %xmm2 # CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - vmaxpd %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmaxpd (%rax), %xmm1, %xmm2 # CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - vmaxpd (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - 2.00 - 2.00 - - - - - - - - vmaxpd %ymm0, %ymm1, %ymm2 # CHECK-NEXT: - - - 2.00 - 2.00 - - - - - - - - vmaxpd %ymm0, %ymm1, %ymm2

View File

@ -465,7 +465,7 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: 1 19 19.00 divsd %xmm0, %xmm2 # CHECK-NEXT: 1 19 19.00 divsd %xmm0, %xmm2
# CHECK-NEXT: 1 24 19.00 * divsd (%rax), %xmm2 # CHECK-NEXT: 1 24 19.00 * divsd (%rax), %xmm2
# CHECK-NEXT: 1 1 1.00 * * U lfence # CHECK-NEXT: 1 1 1.00 * * U lfence
# CHECK-NEXT: 1 1 1.00 * * U maskmovdqu %xmm0, %xmm1 # CHECK-NEXT: 63 34 21.00 * * U maskmovdqu %xmm0, %xmm1
# CHECK-NEXT: 1 2 1.00 maxpd %xmm0, %xmm2 # CHECK-NEXT: 1 2 1.00 maxpd %xmm0, %xmm2
# CHECK-NEXT: 1 7 1.00 * maxpd (%rax), %xmm2 # CHECK-NEXT: 1 7 1.00 * maxpd (%rax), %xmm2
# CHECK-NEXT: 1 2 1.00 maxsd %xmm0, %xmm2 # CHECK-NEXT: 1 2 1.00 maxsd %xmm0, %xmm2
@ -693,7 +693,7 @@ xorpd (%rax), %xmm2
# CHECK: Resource pressure per iteration: # CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
# CHECK-NEXT: 17.00 - - 49.00 204.00 128.50 141.50 118.00 - 16.00 54.00 67.50 67.50 12.00 # CHECK-NEXT: 38.00 21.00 - 50.00 204.00 129.50 142.50 120.00 - 31.00 55.00 67.50 67.50 12.00
# CHECK: Resource pressure by instruction: # CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
@ -755,7 +755,7 @@ xorpd (%rax), %xmm2
# CHECK-NEXT: - - - - 19.00 - 1.00 - - - - - - - divsd %xmm0, %xmm2 # CHECK-NEXT: - - - - 19.00 - 1.00 - - - - - - - divsd %xmm0, %xmm2
# CHECK-NEXT: - - - - 19.00 - 1.00 1.00 - - - - - - divsd (%rax), %xmm2 # CHECK-NEXT: - - - - 19.00 - 1.00 1.00 - - - - - - divsd (%rax), %xmm2
# CHECK-NEXT: - - - - - - - - - 1.00 - - - - lfence # CHECK-NEXT: - - - - - - - - - 1.00 - - - - lfence
# CHECK-NEXT: - - - - - - 1.00 - - 1.00 1.00 - - - maskmovdqu %xmm0, %xmm1 # CHECK-NEXT: 21.00 21.00 - 1.00 - 1.00 2.00 2.00 - 16.00 2.00 - - - maskmovdqu %xmm0, %xmm1
# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - maxpd %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - maxpd %xmm0, %xmm2
# CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - maxpd (%rax), %xmm2 # CHECK-NEXT: - - - 1.00 - 1.00 - 1.00 - - - - - - maxpd (%rax), %xmm2
# CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - maxsd %xmm0, %xmm2 # CHECK-NEXT: - - - 1.00 - 1.00 - - - - - - - - maxsd %xmm0, %xmm2