From 92d89e3072b96b02f677aaaac8f9e9bec657a208 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 27 Mar 2018 20:38:54 +0000 Subject: [PATCH] [X86] Add WriteFMOVMSK/WriteVecMOVMSK/WriteMMXMOVMSK scheduler classes Currently MOVMSK instructions use the WriteVecLogic class, which is a very poor choice given that MOVMSK involves a SSE->GPR transfer. Differential Revision: https://reviews.llvm.org/D44924 llvm-svn: 328664 --- lib/Target/X86/X86InstrMMX.td | 2 +- lib/Target/X86/X86InstrSSE.td | 8 ++++---- lib/Target/X86/X86SchedBroadwell.td | 15 +++++---------- lib/Target/X86/X86SchedHaswell.td | 15 +++++---------- lib/Target/X86/X86SchedSandyBridge.td | 10 ++++++---- lib/Target/X86/X86SchedSkylakeClient.td | 9 +++++---- lib/Target/X86/X86SchedSkylakeServer.td | 15 +++++---------- lib/Target/X86/X86Schedule.td | 5 +++++ lib/Target/X86/X86ScheduleBtVer2.td | 15 ++++++++------- lib/Target/X86/X86ScheduleSLM.td | 5 +++++ lib/Target/X86/X86ScheduleZnver1.td | 18 ++++++++---------- test/CodeGen/X86/avx2-schedule.ll | 4 ++-- test/CodeGen/X86/mmx-schedule.ll | 2 +- test/CodeGen/X86/sse-schedule.ll | 2 +- test/CodeGen/X86/sse2-schedule.ll | 4 ++-- 15 files changed, 63 insertions(+), 66 deletions(-) diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 79b58ae0e25..5f08c94cb12 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -635,7 +635,7 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32orGR64:$dst, (int_x86_mmx_pmovmskb VR64:$src))], - IIC_MMX_MOVMSK>, Sched<[WriteVecLogic]>; + IIC_MMX_MOVMSK>, Sched<[WriteMMXMOVMSK]>; // Low word of XMM to MMX. def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 5ad23994152..127509beab1 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2593,7 +2593,7 @@ multiclass sse12_extr_sign_mask, - Sched<[WriteVecLogic]>; + Sched<[WriteFMOVMSK]>; } let Predicates = [HasAVX] in { @@ -4271,7 +4271,7 @@ defm PINSRW : sse2_pinsrw, PD; // SSE2 - Packed Mask Creation //===---------------------------------------------------------------------===// -let ExeDomain = SSEPackedInt, SchedRW = [WriteVecLogic] in { +let ExeDomain = SSEPackedInt, SchedRW = [WriteVecMOVMSK] in { def VPMOVMSKBrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), @@ -4283,8 +4283,8 @@ let Predicates = [HasAVX2] in { def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR256:$src), "pmovmskb\t{$src, $dst|$dst, $src}", - [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>, - VEX, VEX_L, VEX_WIG; + [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))], + IIC_SSE_MOVMSK>, VEX, VEX_L, VEX_WIG; } def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src), diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td index 71eb873c5ac..db7ae8a6fa6 100755 --- a/lib/Target/X86/X86SchedBroadwell.td +++ b/lib/Target/X86/X86SchedBroadwell.td @@ -227,6 +227,11 @@ def : WriteRes { let ResourceCycles = [4,3,1,1]; } +// MOVMSK Instructions. +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 1; } + // AES instructions. def : WriteRes { // Decryption, encryption. let Latency = 7; @@ -297,7 +302,6 @@ def BWWriteResGroup1 : SchedWriteRes<[BWPort0]> { } def: InstRW<[BWWriteResGroup1], (instregex "MMX_MOVD64from64rr", "MMX_MOVD64grr", - "MMX_PMOVMSKBrr", "MMX_PSLLDri", "MMX_PSLLDrr", "MMX_PSLLQri", @@ -839,15 +843,6 @@ def: InstRW<[BWWriteResGroup25], (instregex "PUSH(16|32|64)rmr", "STOSQ", "STOSW")>; -def BWWriteResGroup26 : SchedWriteRes<[BWPort0]> { - let Latency = 3; - let NumMicroOps = 1; - let ResourceCycles = [1]; -} -def: InstRW<[BWWriteResGroup26], (instregex "(V?)MOVMSKPD(Y?)rr", - "(V?)MOVMSKPS(Y?)rr", - "(V?)PMOVMSKB(Y?)rr")>; - def BWWriteResGroup27 : SchedWriteRes<[BWPort1]> { let Latency = 3; let NumMicroOps = 1; diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index e2ea697b47f..97825257cb7 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -216,6 +216,11 @@ def : WriteRes { let ResourceCycles = [4,3,1,1]; } +// MOVMSK Instructions. +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 1; } + // AES Instructions. def : WriteRes { let Latency = 7; @@ -658,7 +663,6 @@ def HWWriteResGroup2 : SchedWriteRes<[HWPort0]> { } def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64from64rr", "MMX_MOVD64grr", - "MMX_PMOVMSKBrr", "MMX_PSLLDri", "MMX_PSLLDrr", "MMX_PSLLQri", @@ -1763,15 +1767,6 @@ def HWWriteResGroup48 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m", "FARCALL64")>; -def HWWriteResGroup49 : SchedWriteRes<[HWPort0]> { - let Latency = 3; - let NumMicroOps = 1; - let ResourceCycles = [1]; -} -def: InstRW<[HWWriteResGroup49], (instregex "(V?)MOVMSKPD(Y?)rr", - "(V?)MOVMSKPS(Y?)rr", - "(V?)PMOVMSKB(Y?)rr")>; - def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> { let Latency = 3; let NumMicroOps = 1; diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index 31478380ebf..e5fc16844bf 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -204,6 +204,11 @@ def : WriteRes { let ResourceCycles = [7, 1]; } +// MOVMSK Instructions. +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 1; } + // AES Instructions. def : WriteRes { let Latency = 7; @@ -527,10 +532,7 @@ def SBWriteResGroup7 : SchedWriteRes<[SBPort0]> { let NumMicroOps = 1; let ResourceCycles = [1]; } -def: InstRW<[SBWriteResGroup7], (instregex "(V?)PMOVMSKBrr", - "(V?)MOVMSKPD(Y?)rr", - "(V?)MOVMSKPS(Y?)rr", - "(V?)MOVPDI2DIrr", +def: InstRW<[SBWriteResGroup7], (instregex "(V?)MOVPDI2DIrr", "(V?)MOVPQIto64rr")>; def SBWriteResGroup9 : SchedWriteRes<[SBPort05]> { diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td index 2a16982346c..bceb43541d6 100644 --- a/lib/Target/X86/X86SchedSkylakeClient.td +++ b/lib/Target/X86/X86SchedSkylakeClient.td @@ -224,6 +224,11 @@ def : WriteRes { let ResourceCycles = [4,3,1,1]; } +// MOVMSK Instructions. +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 2; } + // AES instructions. def : WriteRes { // Decryption, encryption. let Latency = 4; @@ -692,14 +697,10 @@ def SKLWriteResGroup12 : SchedWriteRes<[SKLPort0]> { } def: InstRW<[SKLWriteResGroup12], (instregex "MMX_MOVD64from64rr", "MMX_MOVD64grr", - "MMX_PMOVMSKBrr", "(V?)COMISDrr", "(V?)COMISSrr", - "(V?)MOVMSKPD(Y?)rr", - "(V?)MOVMSKPS(Y?)rr", "(V?)MOVPDI2DIrr", "(V?)MOVPQIto64rr", - "(V?)PMOVMSKB(Y?)rr", "VTESTPD(Y?)rr", "VTESTPS(Y?)rr", "(V?)UCOMISDrr", diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td index c9a9c60ffc0..8bfd2131bf0 100755 --- a/lib/Target/X86/X86SchedSkylakeServer.td +++ b/lib/Target/X86/X86SchedSkylakeServer.td @@ -224,6 +224,11 @@ def : WriteRes { let ResourceCycles = [4,3,1,1]; } +// MOVMSK Instructions. +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 2; } + // AES instructions. def : WriteRes { // Decryption, encryption. let Latency = 4; @@ -1444,28 +1449,18 @@ def: InstRW<[SKXWriteResGroup12], (instregex "COMISDrr", "COMISSrr", "MMX_MOVD64from64rr", "MMX_MOVD64grr", - "MMX_PMOVMSKBrr", - "MOVMSKPDrr", - "MOVMSKPSrr", "MOVPDI2DIrr", "MOVPQIto64rr", - "PMOVMSKBrr", "UCOMISDrr", "UCOMISSrr", "VCOMISDZrr(b?)", "VCOMISDrr", "VCOMISSZrr(b?)", "VCOMISSrr", - "VMOVMSKPDYrr", - "VMOVMSKPDrr", - "VMOVMSKPSYrr", - "VMOVMSKPSrr", "VMOVPDI2DIZrr(b?)(k?)(z?)", "VMOVPDI2DIrr", "VMOVPQIto64Zrr(b?)(k?)(z?)", "VMOVPQIto64rr", - "VPMOVMSKBYrr", - "VPMOVMSKBrr", "VTESTPDYrr", "VTESTPDrr", "VTESTPSYrr", diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index 85ca7a9b0ad..6136d96fcfb 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -105,6 +105,11 @@ defm WriteMPSAD : X86SchedWritePair; // Vector MPSAD. // These are often used on both floating point and integer vectors. defm WriteVecLogic : X86SchedWritePair; // Vector and/or/xor. +// MOVMSK operations. +def WriteFMOVMSK : SchedWrite; +def WriteVecMOVMSK : SchedWrite; +def WriteMMXMOVMSK : SchedWrite; + // Conversion between integer and float. defm WriteCvtF2I : X86SchedWritePair; // Float -> Integer. defm WriteCvtI2F : X86SchedWritePair; // Integer -> Float. diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index af379354d3b..4d8838372fe 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -358,6 +358,14 @@ defm : JWriteResFpuPair; defm : JWriteResFpuPair; +//////////////////////////////////////////////////////////////////////////////// +// MOVMSK Instructions. +//////////////////////////////////////////////////////////////////////////////// + +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } +def : WriteRes { let Latency = 3; } + //////////////////////////////////////////////////////////////////////////////// // AES Instructions. //////////////////////////////////////////////////////////////////////////////// @@ -771,13 +779,6 @@ def JWriteVMaskMovYSt: SchedWriteRes<[JFPU01, JFPX, JSAGU]> { } def : InstRW<[JWriteVMaskMovYSt], (instrs VMASKMOVPDYmr, VMASKMOVPSYmr)>; -def JWriteVMOVMSK: SchedWriteRes<[JFPU0, JFPA, JALU0]> { - let Latency = 3; -} -def : InstRW<[JWriteVMOVMSK], (instrs MOVMSKPDrr, VMOVMSKPDrr, VMOVMSKPDYrr, - MOVMSKPSrr, VMOVMSKPSrr, VMOVMSKPSYrr, - PMOVMSKBrr, VPMOVMSKBrr, MMX_PMOVMSKBrr)>; - def JWriteVTESTY: SchedWriteRes<[JFPU01, JFPX, JFPA, JALU0]> { let Latency = 4; let ResourceCycles = [2, 2, 2, 1]; diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index 6fb7e9480c9..87b1bf26c6e 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -185,6 +185,11 @@ def : WriteRes { let ResourceCycles = [21, 1]; } +// MOVMSK Instructions. +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } +def : WriteRes { let Latency = 4; } + // AES Instructions. def : WriteRes { let Latency = 8; diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td index 5459485b398..252243c6a7a 100644 --- a/lib/Target/X86/X86ScheduleZnver1.td +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -216,6 +216,11 @@ defm : ZnWriteResFpuPair; // Vector Shift Operations defm : ZnWriteResFpuPair; +// MOVMSK Instructions. +def : WriteRes; +def : WriteRes; +def : WriteRes; + // AES Instructions. defm : ZnWriteResFpuPair; defm : ZnWriteResFpuPair; @@ -1004,14 +1009,12 @@ def : InstRW<[WriteMicrocoded], // m, v,v. def : InstRW<[WriteMicrocoded], (instregex "VPMASKMOV(D|Q)(Y?)mr")>; -// PMOVMSKB. -def ZnWritePMOVMSKB : SchedWriteRes<[ZnFPU2]> { - let NumMicroOps = 2; -} +// PMOVMSKBY. def ZnWritePMOVMSKBY : SchedWriteRes<[ZnFPU2]> { + let NumMicroOps = 2; let Latency = 2; + let ResourceCycles = [2]; } -def : InstRW<[ZnWritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKBrr")>; def : InstRW<[ZnWritePMOVMSKBY], (instregex "(V|MMX_)?PMOVMSKBYrr")>; // PEXTR B/W/D/Q. @@ -1150,11 +1153,6 @@ def : InstRW<[ZnWritePShiftY], (instregex "(V?)PS(R|L)LDQYri")>; //=== Floating Point XMM and YMM Instructions ===// //-- Move instructions --// -// MOVMSKP S/D. -// r32 <- x,y. -def ZnWriteMOVMSKPr : SchedWriteRes<[ZnFPU2]> ; -def : InstRW<[ZnWriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)(Y?)rr")>; - // VPERM2F128. def : InstRW<[WriteMicrocoded], (instregex "VPERM2F128rr")>; def : InstRW<[WriteMicrocoded], (instregex "VPERM2F128rm")>; diff --git a/test/CodeGen/X86/avx2-schedule.ll b/test/CodeGen/X86/avx2-schedule.ll index 600a20892cc..f697a649083 100644 --- a/test/CodeGen/X86/avx2-schedule.ll +++ b/test/CodeGen/X86/avx2-schedule.ll @@ -4096,7 +4096,7 @@ declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readn define i32 @test_pmovmskb(<32 x i8> %a0) { ; GENERIC-LABEL: test_pmovmskb: ; GENERIC: # %bb.0: -; GENERIC-NEXT: vpmovmskb %ymm0, %eax # sched: [1:1.00] +; GENERIC-NEXT: vpmovmskb %ymm0, %eax # sched: [2:1.00] ; GENERIC-NEXT: vzeroupper # sched: [100:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -4126,7 +4126,7 @@ define i32 @test_pmovmskb(<32 x i8> %a0) { ; ; ZNVER1-LABEL: test_pmovmskb: ; ZNVER1: # %bb.0: -; ZNVER1-NEXT: vpmovmskb %ymm0, %eax # sched: [2:1.00] +; ZNVER1-NEXT: vpmovmskb %ymm0, %eax # sched: [2:2.00] ; ZNVER1-NEXT: vzeroupper # sched: [100:?] ; ZNVER1-NEXT: retq # sched: [1:0.50] %1 = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) diff --git a/test/CodeGen/X86/mmx-schedule.ll b/test/CodeGen/X86/mmx-schedule.ll index 1149ed0158d..ad76dd5cc0d 100644 --- a/test/CodeGen/X86/mmx-schedule.ll +++ b/test/CodeGen/X86/mmx-schedule.ll @@ -4068,7 +4068,7 @@ define i32 @test_pmovmskb(x86_mmx %a0) optsize { ; ; SLM-LABEL: test_pmovmskb: ; SLM: # %bb.0: -; SLM-NEXT: pmovmskb %mm0, %eax # sched: [1:0.50] +; SLM-NEXT: pmovmskb %mm0, %eax # sched: [4:1.00] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-LABEL: test_pmovmskb: diff --git a/test/CodeGen/X86/sse-schedule.ll b/test/CodeGen/X86/sse-schedule.ll index 2d578b533ed..7ed52f03339 100644 --- a/test/CodeGen/X86/sse-schedule.ll +++ b/test/CodeGen/X86/sse-schedule.ll @@ -3098,7 +3098,7 @@ define i32 @test_movmskps(<4 x float> %a0) { ; ; SLM-LABEL: test_movmskps: ; SLM: # %bb.0: -; SLM-NEXT: movmskps %xmm0, %eax # sched: [1:0.50] +; SLM-NEXT: movmskps %xmm0, %eax # sched: [4:1.00] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-SSE-LABEL: test_movmskps: diff --git a/test/CodeGen/X86/sse2-schedule.ll b/test/CodeGen/X86/sse2-schedule.ll index 3724de9f2d0..904289da0d6 100644 --- a/test/CodeGen/X86/sse2-schedule.ll +++ b/test/CodeGen/X86/sse2-schedule.ll @@ -5107,7 +5107,7 @@ define i32 @test_movmskpd(<2 x double> %a0) { ; ; SLM-LABEL: test_movmskpd: ; SLM: # %bb.0: -; SLM-NEXT: movmskpd %xmm0, %eax # sched: [1:0.50] +; SLM-NEXT: movmskpd %xmm0, %eax # sched: [4:1.00] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-SSE-LABEL: test_movmskpd: @@ -9684,7 +9684,7 @@ define i32 @test_pmovmskb(<16 x i8> %a0) { ; ; SLM-LABEL: test_pmovmskb: ; SLM: # %bb.0: -; SLM-NEXT: pmovmskb %xmm0, %eax # sched: [1:0.50] +; SLM-NEXT: pmovmskb %xmm0, %eax # sched: [4:1.00] ; SLM-NEXT: retq # sched: [4:1.00] ; ; SANDY-SSE-LABEL: test_pmovmskb: