From 586aaeabf123c14b6094e9bbfdbd92f693293c7d Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Tue, 15 Jun 2021 11:55:46 +0300
Subject: [PATCH] [X86] Schedule-model second (mask) output of GATHER
 instruction

Much like `mulx`'s `WriteIMulH`, there are two outputs of AVX2 GATHER
instructions. This was changed back in rL160110, but the sched model
change wasn't present. So right now, for sched models that are marked
as complete (`znver3` only now), codegen'ning `GATHER` results in a
crash:
```
DefIdx 1 exceeds machine model writes for
early-clobber renamable $ymm3, dead early-clobber renamable $ymm2 =
VPGATHERDDYrm killed renamable $ymm3(tied-def 0), undef renamable $rax,
4, renamable $ymm0, 0, $noreg, killed renamable $ymm2(tied-def 1)
:: (load 32, align 1)
```
https://godbolt.org/z/Ks7zW7WGh

I'm guessing we need to deal with this like we deal with `WriteIMulH`.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D104205
---
 lib/Target/X86/X86InstrAVX512.td        | 3 ++-
 lib/Target/X86/X86InstrSSE.td           | 4 ++--
 lib/Target/X86/X86SchedBroadwell.td     | 4 ++++
 lib/Target/X86/X86SchedHaswell.td       | 4 ++++
 lib/Target/X86/X86SchedSandyBridge.td   | 1 +
 lib/Target/X86/X86SchedSkylakeClient.td | 4 ++++
 lib/Target/X86/X86SchedSkylakeServer.td | 4 ++++
 lib/Target/X86/X86Schedule.td           | 1 +
 lib/Target/X86/X86ScheduleAtom.td       | 1 +
 lib/Target/X86/X86ScheduleBdVer2.td     | 1 +
 lib/Target/X86/X86ScheduleBtVer2.td     | 1 +
 lib/Target/X86/X86ScheduleSLM.td        | 1 +
 lib/Target/X86/X86ScheduleZnver1.td     | 4 ++++
 lib/Target/X86/X86ScheduleZnver2.td     | 4 ++++
 lib/Target/X86/X86ScheduleZnver3.td     | 4 ++++
 15 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index b16fc068331..dd61d91c3a6 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -9677,7 +9677,8 @@ multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
             (ins _.RC:$src1, MaskRC:$mask, memop:$src2),
             !strconcat(OpcodeStr#_.Suffix,
             "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
-            []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>;
+            []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
+            Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
 }
 
 multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 2558aec6136..ae40b712edf 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7882,12 +7882,12 @@ let mayLoad = 1, hasSideEffects = 0 in {
             (ins VR128:$src1, memop128:$src2, VR128:$mask),
             !strconcat(OpcodeStr,
               "\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
-            []>, VEX, Sched<[WriteLoad]>;
+            []>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
   def Yrm : AVX28I
-            []>, VEX, VEX_L, Sched<[WriteLoad]>;
+            []>, VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
 }
 }
diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td
index f48a7720bb7..d2ced1c6740 100644
--- a/lib/Target/X86/X86SchedBroadwell.td
+++ b/lib/Target/X86/X86SchedBroadwell.td
@@ -206,6 +206,10 @@ defm : X86WriteRes;
 defm : X86WriteRes;
 defm : X86WriteRes;
 
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
 // Idioms that clear a register, like xorps %xmm0, %xmm0.
 // These can often bypass execution ports completely.
 def : WriteRes;
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index c72e9caea33..7ce259b20cb 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -125,6 +125,10 @@ defm : X86WriteRes;
 defm : X86WriteRes;
 def : WriteRes;
 
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
 // Arithmetic.
 defm : HWWriteResPair;
 defm : HWWriteResPair;
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index aa4bfcda36d..2f7157f4326 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -112,6 +112,7 @@ def : WriteRes;
 def : WriteRes { let Latency = 5; }
 def : WriteRes;
 def : WriteRes;
+def : WriteRes<WriteVecMaskedGatherWriteback, []> { let Latency = 5; let NumMicroOps = 0; }
 
 // Arithmetic.
 defm : SBWriteResPair;
diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td
index 7b99a8456f1..8486bdda034 100644
--- a/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/lib/Target/X86/X86SchedSkylakeClient.td
@@ -203,6 +203,10 @@ defm : X86WriteRes;
 defm : X86WriteRes;
 defm : X86WriteRes;
 
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
 // Idioms that clear a register, like xorps %xmm0, %xmm0.
 // These can often bypass execution ports completely.
 def : WriteRes;
diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td
index 09d40e36eb6..ba80d47c4eb 100644
--- a/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/lib/Target/X86/X86SchedSkylakeServer.td
@@ -203,6 +203,10 @@ defm : X86WriteRes;
 defm : X86WriteRes;
 defm : X86WriteRes;
 
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
+defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
+
 // Idioms that clear a register, like xorps %xmm0, %xmm0.
 // These can often bypass execution ports completely.
 def : WriteRes;
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 2aa5fb4b86e..09148fc19e5 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -125,6 +125,7 @@ def WriteLoad : SchedWrite;
 def WriteStore : SchedWrite;
 def WriteStoreNT : SchedWrite;
 def WriteMove : SchedWrite;
+def WriteVecMaskedGatherWriteback : SchedWrite;
 def WriteCopy : WriteSequence<[WriteLoad, WriteStore]>; // mem->mem copy
 
 // Arithmetic.
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index 8ec278f14d8..d441969c58e 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -165,6 +165,7 @@ def : WriteRes;
 def : WriteRes;
 def : WriteRes;
 def : WriteRes;
+defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
 
 // Treat misc copies as a move.
 def : InstRW<[WriteMove], (instrs COPY)>;
diff --git a/lib/Target/X86/X86ScheduleBdVer2.td b/lib/Target/X86/X86ScheduleBdVer2.td
index d3244bcfbb1..99d4011dae7 100644
--- a/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/lib/Target/X86/X86ScheduleBdVer2.td
@@ -269,6 +269,7 @@ def : WriteRes { let Latency = 5; let ResourceCycles = [
 def : WriteRes;
 def : WriteRes;
 def : WriteRes { let ResourceCycles = [2]; }
+defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
 
 // Load/store MXCSR.
 // FIXME: These are copy and pasted from WriteLoad/Store.
diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td
index 6ba6183d76e..ef156b57315 100644
--- a/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/lib/Target/X86/X86ScheduleBtVer2.td
@@ -273,6 +273,7 @@ def : WriteRes { let Latency = 3; }
 def : WriteRes;
 def : WriteRes;
 def : WriteRes;
+defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
 
 // Load/store MXCSR.
 def : WriteRes { let Latency = 3; }
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
index 98e5a2ede05..1f47d1aa404 100644
--- a/lib/Target/X86/X86ScheduleSLM.td
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -88,6 +88,7 @@ def : WriteRes;
 def : WriteRes { let Latency = 3; }
 def : WriteRes;
 def : WriteRes;
+defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
 
 // Load/store MXCSR.
 // FIXME: These are probably wrong. They are copy pasted from WriteStore/Load.
diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td
index 93f9d5de17c..3cb2f17d674 100644
--- a/lib/Target/X86/X86ScheduleZnver1.td
+++ b/lib/Target/X86/X86ScheduleZnver1.td
@@ -179,6 +179,10 @@ def : WriteRes;
 def : WriteRes;
 def : WriteRes { let Latency = 8; }
 
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
+def : WriteRes<WriteVecMaskedGatherWriteback, []> { let Latency = 8; let NumMicroOps = 0; }
+
 def : WriteRes;
 def : WriteRes;
 defm : ZnWriteResPair;
diff --git a/lib/Target/X86/X86ScheduleZnver2.td b/lib/Target/X86/X86ScheduleZnver2.td
index aebe313c0e7..f5576c972bb 100644
--- a/lib/Target/X86/X86ScheduleZnver2.td
+++ b/lib/Target/X86/X86ScheduleZnver2.td
@@ -178,6 +178,10 @@ def : WriteRes;
 def : WriteRes;
 def : WriteRes { let Latency = 8; }
 
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
+def : WriteRes<WriteVecMaskedGatherWriteback, []> { let Latency = 8; let NumMicroOps = 0; }
+
 def : WriteRes;
 def : WriteRes;
 defm : Zn2WriteResPair;
diff --git a/lib/Target/X86/X86ScheduleZnver3.td b/lib/Target/X86/X86ScheduleZnver3.td
index 599f0b5de74..bc92f0dcd04 100644
--- a/lib/Target/X86/X86ScheduleZnver3.td
+++ b/lib/Target/X86/X86ScheduleZnver3.td
@@ -495,6 +495,10 @@ defm : Zn3WriteResInt;
 
+// Model the effect of clobbering the read-write mask operand of the GATHER operation.
+// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
+defm : Zn3WriteResInt<WriteVecMaskedGatherWriteback, [], Znver3Model.LoadLatency, [], 0>;
+
 def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
   let Latency = !add(Znver3Model.LoadLatency, 1);
   let ResourceCycles = [3, 1];