1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-22 10:42:39 +01:00

[X86] Schedule-model second (mask) output of GATHER instruction

Much like `mulx`'s `WriteIMulH`, there are two outputs of
AVX2 GATHER instructions. This was changed back in rL160110,
but the sched model change wasn't present.

So right now, for sched models that are marked as complete
(`znver3` only now), codegen'ning `GATHER` results in a crash:
```
DefIdx 1 exceeds machine model writes for early-clobber renamable $ymm3, dead early-clobber renamable $ymm2 = VPGATHERDDYrm killed renamable $ymm3(tied-def 0), undef renamable $rax, 4, renamable $ymm0, 0, $noreg, killed renamable $ymm2(tied-def 1) :: (load 32, align 1)
```
https://godbolt.org/z/Ks7zW7WGh

I'm guessing we need to deal with this like we deal with `WriteIMulH`.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D104205
This commit is contained in:
Roman Lebedev 2021-06-15 11:55:46 +03:00
parent ad80113e58
commit 586aaeabf1
15 changed files with 38 additions and 3 deletions

View File

@ -9677,7 +9677,8 @@ multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
(ins _.RC:$src1, MaskRC:$mask, memop:$src2),
!strconcat(OpcodeStr#_.Suffix,
"\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
[]>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>;
[]>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
}
multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,

View File

@ -7882,12 +7882,12 @@ let mayLoad = 1, hasSideEffects = 0 in {
(ins VR128:$src1, memop128:$src2, VR128:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
[]>, VEX, Sched<[WriteLoad]>;
[]>, VEX, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
(ins RC256:$src1, memop256:$src2, RC256:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
[]>, VEX, VEX_L, Sched<[WriteLoad]>;
[]>, VEX, VEX_L, Sched<[WriteLoad, WriteVecMaskedGatherWriteback]>;
}
}

View File

@ -206,6 +206,10 @@ defm : X86WriteRes<WriteStore, [BWPort237, BWPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteStoreNT, [BWPort237, BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteMove, [BWPort0156], 1, [1], 1>;
// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
def : WriteRes<WriteZero, []>;

View File

@ -125,6 +125,10 @@ defm : X86WriteRes<WriteLoad, [HWPort23], 5, [1], 1>;
defm : X86WriteRes<WriteMove, [HWPort0156], 1, [1], 1>;
def : WriteRes<WriteZero, []>;
// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
// Arithmetic.
defm : HWWriteResPair<WriteALU, [HWPort0156], 1>;
defm : HWWriteResPair<WriteADC, [HWPort06, HWPort0156], 2, [1,1], 2>;

View File

@ -112,6 +112,7 @@ def : WriteRes<WriteStoreNT, [SBPort23, SBPort4]>;
def : WriteRes<WriteLoad, [SBPort23]> { let Latency = 5; }
def : WriteRes<WriteMove, [SBPort015]>;
def : WriteRes<WriteZero, []>;
def : WriteRes<WriteVecMaskedGatherWriteback, []> { let Latency = 5; let NumMicroOps = 0; }
// Arithmetic.
defm : SBWriteResPair<WriteALU, [SBPort015], 1>;

View File

@ -203,6 +203,10 @@ defm : X86WriteRes<WriteStore, [SKLPort237, SKLPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteStoreNT, [SKLPort237, SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteMove, [SKLPort0156], 1, [1], 1>;
// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
def : WriteRes<WriteZero, []>;

View File

@ -203,6 +203,10 @@ defm : X86WriteRes<WriteStore, [SKXPort237, SKXPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteStoreNT, [SKXPort237, SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteMove, [SKXPort0156], 1, [1], 1>;
// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
defm : X86WriteRes<WriteVecMaskedGatherWriteback, [], 5, [], 0>;
// Idioms that clear a register, like xorps %xmm0, %xmm0.
// These can often bypass execution ports completely.
def : WriteRes<WriteZero, []>;

View File

@ -125,6 +125,7 @@ def WriteLoad : SchedWrite;
def WriteStore : SchedWrite;
def WriteStoreNT : SchedWrite;
def WriteMove : SchedWrite;
def WriteVecMaskedGatherWriteback : SchedWrite;
def WriteCopy : WriteSequence<[WriteLoad, WriteStore]>; // mem->mem copy
// Arithmetic.

View File

@ -165,6 +165,7 @@ def : WriteRes<WriteLoad, [AtomPort0]>;
def : WriteRes<WriteStore, [AtomPort0]>;
def : WriteRes<WriteStoreNT, [AtomPort0]>;
def : WriteRes<WriteMove, [AtomPort01]>;
defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
// Treat misc copies as a move.
def : InstRW<[WriteMove], (instrs COPY)>;

View File

@ -269,6 +269,7 @@ def : WriteRes<WriteLoad, [PdLoad]> { let Latency = 5; let ResourceCycles = [
def : WriteRes<WriteStore, [PdStore]>;
def : WriteRes<WriteStoreNT, [PdStore]>;
def : WriteRes<WriteMove, [PdEX01]> { let ResourceCycles = [2]; }
defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
// Load/store MXCSR.
// FIXME: These are copy and pasted from WriteLoad/Store.

View File

@ -273,6 +273,7 @@ def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 3; }
def : WriteRes<WriteStore, [JSAGU]>;
def : WriteRes<WriteStoreNT, [JSAGU]>;
def : WriteRes<WriteMove, [JALU01]>;
defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
// Load/store MXCSR.
def : WriteRes<WriteLDMXCSR, [JLAGU]> { let Latency = 3; }

View File

@ -88,6 +88,7 @@ def : WriteRes<WriteStoreNT, [SLM_IEC_RSV01, SLM_MEC_RSV]>;
def : WriteRes<WriteLoad, [SLM_MEC_RSV]> { let Latency = 3; }
def : WriteRes<WriteMove, [SLM_IEC_RSV01]>;
def : WriteRes<WriteZero, []>;
defm : X86WriteResUnsupported<WriteVecMaskedGatherWriteback>;
// Load/store MXCSR.
// FIXME: These are probably wrong. They are copy pasted from WriteStore/Load.

View File

@ -179,6 +179,10 @@ def : WriteRes<WriteStoreNT, [ZnAGU]>;
def : WriteRes<WriteMove, [ZnALU]>;
def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; }
// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
def : WriteRes<WriteVecMaskedGatherWriteback, []> { let Latency = 8; let NumMicroOps = 0; }
def : WriteRes<WriteZero, []>;
def : WriteRes<WriteLEA, [ZnALU]>;
defm : ZnWriteResPair<WriteALU, [ZnALU], 1>;

View File

@ -178,6 +178,10 @@ def : WriteRes<WriteStoreNT, [Zn2AGU]>;
def : WriteRes<WriteMove, [Zn2ALU]>;
def : WriteRes<WriteLoad, [Zn2AGU]> { let Latency = 8; }
// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
def : WriteRes<WriteVecMaskedGatherWriteback, []> { let Latency = 8; let NumMicroOps = 0; }
def : WriteRes<WriteZero, []>;
def : WriteRes<WriteLEA, [Zn2ALU]>;
defm : Zn2WriteResPair<WriteALU, [Zn2ALU], 1>;

View File

@ -495,6 +495,10 @@ defm : Zn3WriteResInt<WriteRMW, [Zn3AGU012, Zn3Store], Znver3Model.StoreLatency,
// Loads, stores, and moves, not folded with other operations.
defm : Zn3WriteResInt<WriteLoad, [Zn3AGU012, Zn3Load], !add(Znver3Model.LoadLatency, 1), [1, 1], 1>;
// Model the effect of clobbering the read-write mask operand of the GATHER operation.
// Does not cost anything by itself, only has latency, matching that of the WriteLoad,
defm : Zn3WriteResInt<WriteVecMaskedGatherWriteback, [], !add(Znver3Model.LoadLatency, 1), [], 0>;
def Zn3WriteMOVSlow : SchedWriteRes<[Zn3AGU012, Zn3Load]> {
let Latency = !add(Znver3Model.LoadLatency, 1);
let ResourceCycles = [3, 1];