diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index b16fc068331..dd61d91c3a6 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -9677,7 +9677,8 @@ multiclass avx512_gather opc, string OpcodeStr, X86VectorVTInfo _, (ins _.RC:$src1, MaskRC:$mask, memop:$src2), !strconcat(OpcodeStr#_.Suffix, "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>; + []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>; } multiclass avx512_gather_q_pd dopc, bits<8> qopc, diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 2558aec6136..ae40b712edf 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -7882,12 +7882,12 @@ let mayLoad = 1, hasSideEffects = 0 in { (ins VR128:$src1, memop128:$src2, VR128:$mask), !strconcat(OpcodeStr, "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), - []>, VEX, Sched<[WriteLoad]>; + []>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>; def Yrm : AVX28I, VEX, VEX_L, Sched<[WriteLoad]>; + []>, VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>; } } diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td index f48a7720bb7..d2ced1c6740 100644 --- a/lib/Target/X86/X86SchedBroadwell.td +++ b/lib/Target/X86/X86SchedBroadwell.td @@ -206,6 +206,10 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +// Model the effect of clobbering the read-write mask operand of the GATHER operation. +// Does not cost anything by itself, only has latency, matching that of the WriteLoad, +defm : X86WriteRes; + // Idioms that clear a register, like xorps %xmm0, %xmm0. // These can often bypass execution ports completely. def : WriteRes; diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index c72e9caea33..7ce259b20cb 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -125,6 +125,10 @@ defm : X86WriteRes; defm : X86WriteRes; def : WriteRes; +// Model the effect of clobbering the read-write mask operand of the GATHER operation. +// Does not cost anything by itself, only has latency, matching that of the WriteLoad, +defm : X86WriteRes; + // Arithmetic. defm : HWWriteResPair; defm : HWWriteResPair; diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index aa4bfcda36d..2f7157f4326 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -112,6 +112,7 @@ def : WriteRes; def : WriteRes { let Latency = 5; } def : WriteRes; def : WriteRes; +def : WriteRes { let Latency = 5; let NumMicroOps = 0; } // Arithmetic. defm : SBWriteResPair; diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td index 7b99a8456f1..8486bdda034 100644 --- a/lib/Target/X86/X86SchedSkylakeClient.td +++ b/lib/Target/X86/X86SchedSkylakeClient.td @@ -203,6 +203,10 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +// Model the effect of clobbering the read-write mask operand of the GATHER operation. +// Does not cost anything by itself, only has latency, matching that of the WriteLoad, +defm : X86WriteRes; + // Idioms that clear a register, like xorps %xmm0, %xmm0. // These can often bypass execution ports completely. def : WriteRes; diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td index 09d40e36eb6..ba80d47c4eb 100644 --- a/lib/Target/X86/X86SchedSkylakeServer.td +++ b/lib/Target/X86/X86SchedSkylakeServer.td @@ -203,6 +203,10 @@ defm : X86WriteRes; defm : X86WriteRes; defm : X86WriteRes; +// Model the effect of clobbering the read-write mask operand of the GATHER operation. +// Does not cost anything by itself, only has latency, matching that of the WriteLoad, +defm : X86WriteRes; + // Idioms that clear a register, like xorps %xmm0, %xmm0. // These can often bypass execution ports completely. def : WriteRes; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 2aa5fb4b86e..09148fc19e5 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -125,6 +125,7 @@ def WriteLoad : SchedWrite; def WriteStore : SchedWrite; def WriteStoreNT : SchedWrite; def WriteMove : SchedWrite; +def WriteVecMaskedGatherWriteback : SchedWrite; def WriteCopy : WriteSequence<[WriteLoad, WriteStore]>; // mem->mem copy // Arithmetic. diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index 8ec278f14d8..d441969c58e 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -165,6 +165,7 @@ def : WriteRes; def : WriteRes; def : WriteRes; def : WriteRes; +defm : X86WriteResUnsupported; // Treat misc copies as a move. def : InstRW<[WriteMove], (instrs COPY)>; diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td index d3244bcfbb1..99d4011dae7 100644 --- a/lib/Target/X86/X86ScheduleBdVer2.td +++ b/lib/Target/X86/X86ScheduleBdVer2.td @@ -269,6 +269,7 @@ def : WriteRes { let Latency = 5; let ResourceCycles = [ def : WriteRes; def : WriteRes; def : WriteRes { let ResourceCycles = [2]; } +defm : X86WriteResUnsupported; // Load/store MXCSR. // FIXME: These are copy and pasted from WriteLoad/Store. diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index 6ba6183d76e..ef156b57315 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -273,6 +273,7 @@ def : WriteRes { let Latency = 3; } def : WriteRes; def : WriteRes; def : WriteRes; +defm : X86WriteResUnsupported; // Load/store MXCSR. def : WriteRes { let Latency = 3; } diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index 98e5a2ede05..1f47d1aa404 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -88,6 +88,7 @@ def : WriteRes; def : WriteRes { let Latency = 3; } def : WriteRes; def : WriteRes; +defm : X86WriteResUnsupported; // Load/store MXCSR. // FIXME: These are probably wrong. They are copy pasted from WriteStore/Load. diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td index 93f9d5de17c..3cb2f17d674 100644 --- a/lib/Target/X86/X86ScheduleZnver1.td +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -179,6 +179,10 @@ def : WriteRes; def : WriteRes; def : WriteRes { let Latency = 8; } +// Model the effect of clobbering the read-write mask operand of the GATHER operation. +// Does not cost anything by itself, only has latency, matching that of the WriteLoad, +def : WriteRes { let Latency = 8; let NumMicroOps = 0; } + def : WriteRes; def : WriteRes; defm : ZnWriteResPair; diff --git a/lib/Target/X86/X86ScheduleZnver2.td b/lib/Target/X86/X86ScheduleZnver2.td index aebe313c0e7..f5576c972bb 100644 --- a/lib/Target/X86/X86ScheduleZnver2.td +++ b/lib/Target/X86/X86ScheduleZnver2.td @@ -178,6 +178,10 @@ def : WriteRes; def : WriteRes; def : WriteRes { let Latency = 8; } +// Model the effect of clobbering the read-write mask operand of the GATHER operation. +// Does not cost anything by itself, only has latency, matching that of the WriteLoad, +def : WriteRes { let Latency = 8; let NumMicroOps = 0; } + def : WriteRes; def : WriteRes; defm : Zn2WriteResPair; diff --git a/lib/Target/X86/X86ScheduleZnver3.td b/lib/Target/X86/X86ScheduleZnver3.td index 599f0b5de74..bc92f0dcd04 100644 --- a/lib/Target/X86/X86ScheduleZnver3.td +++ b/lib/Target/X86/X86ScheduleZnver3.td @@ -495,6 +495,10 @@ defm : Zn3WriteResInt; +// Model the effect of clobbering the read-write mask operand of the GATHER operation. +// Does not cost anything by itself, only has latency, matching that of the WriteLoad, +defm : Zn3WriteResInt; + def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> { let Latency = !add(Znver3Model.LoadLatency, 1); let ResourceCycles = [3, 1];