From 96c5b6d4fadec40722b50fd9a94ed599956bd67d Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Mon, 1 Oct 2018 10:35:13 +0000 Subject: [PATCH] [X86][BtVer2] Teach how to identify zero-idiom VPERM2F128rr instructions. This patch adds another variant class to identify zero-idiom VPERM2F128rr instructions. On Jaguar, a VPERM with bits 3 and 7 of the mask set is a zero-idiom. Differential Revision: https://reviews.llvm.org/D52663 llvm-svn: 343452 --- lib/Target/X86/X86SchedPredicates.td | 7 +++++ lib/Target/X86/X86ScheduleBtVer2.td | 10 ++++++- .../llvm-mca/X86/BtVer2/zero-idioms-avx-256.s | 30 +++++++++---------- 3 files changed, 31 insertions(+), 16 deletions(-) diff --git a/lib/Target/X86/X86SchedPredicates.td b/lib/Target/X86/X86SchedPredicates.td index 11b567c18cf..1c7f24375f6 100644 --- a/lib/Target/X86/X86SchedPredicates.td +++ b/lib/Target/X86/X86SchedPredicates.td @@ -19,6 +19,13 @@ // different zero-idioms. def ZeroIdiomPredicate : CheckSameRegOperand<1, 2>; +// A predicate used to identify VPERM that have bits 3 and 7 of their mask set. +// On some processors, these VPERM instructions are zero-idioms. +def ZeroIdiomVPERMPredicate : CheckAll<[ + ZeroIdiomPredicate, + CheckImmOperand<3, 0x88> +]>; + // A predicate used to check if a LEA instruction uses all three source // operands: base, index, and offset. def IsThreeOperandsLEAPredicate: CheckAll<[ diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index 2729e7f8e4e..9df0c779264 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -688,6 +688,12 @@ def : InstRW<[JWriteVZeroIdiomALUX], (instrs PSUBBrr, VPSUBBrr, PCMPGTQrr, VPCMPGTQrr, PCMPGTWrr, VPCMPGTWrr)>; +def JWriteVPERM2F128 : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomVPERMPredicate>, [JWriteZeroIdiomYmm]>, + SchedVar<NoSchedPred, [WriteFShuffle256]> +]>; +def : InstRW<[JWriteVPERM2F128], (instrs VPERM2F128rr)>; + // This write is used for slow LEA instructions. 
def JWrite3OpsLEA : SchedWriteRes<[JALU1, JSAGU]> { let Latency = 2; @@ -762,7 +768,9 @@ def : IsZeroIdiomFunction<[ // ymm variants. VXORPSYrr, VXORPDYrr, VANDNPSYrr, VANDNPDYrr - ], ZeroIdiomPredicate> + ], ZeroIdiomPredicate>, + + DepBreakingClass<[ VPERM2F128rr ], ZeroIdiomVPERMPredicate> ]>; def : IsDepBreakingFunction<[ diff --git a/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s b/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s index 7600368c7c0..b1669f33405 100644 --- a/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s +++ b/test/tools/llvm-mca/X86/BtVer2/zero-idioms-avx-256.s @@ -330,12 +330,12 @@ vaddps %ymm1, %ymm1, %ymm0 # CHECK: Iterations: 100 # CHECK-NEXT: Instructions: 200 -# CHECK-NEXT: Total Cycles: 403 +# CHECK-NEXT: Total Cycles: 205 # CHECK-NEXT: Total uOps: 400 # CHECK: Dispatch Width: 2 -# CHECK-NEXT: uOps Per Cycle: 0.99 -# CHECK-NEXT: IPC: 0.50 +# CHECK-NEXT: uOps Per Cycle: 1.95 +# CHECK-NEXT: IPC: 0.98 # CHECK-NEXT: Block RThroughput: 2.0 # CHECK: Instruction Info: @@ -347,7 +347,7 @@ vaddps %ymm1, %ymm1, %ymm0 # CHECK-NEXT: [6]: HasSideEffects (U) # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 2 1 1.00 vperm2f128 $136, %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: 2 1 0.50 vperm2f128 $136, %ymm0, %ymm0, %ymm1 # CHECK-NEXT: 2 3 2.00 vaddps %ymm1, %ymm1, %ymm0 # CHECK: Resources: @@ -368,23 +368,23 @@ vaddps %ymm1, %ymm1, %ymm0 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: - - - 2.00 2.00 2.00 2.00 - - - - - - - +# CHECK-NEXT: - - - 2.00 1.00 2.00 1.00 - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: -# CHECK-NEXT: - - - - 2.00 - 2.00 - - - - - - - vperm2f128 $136, %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: - - - - 1.00 - 1.00 - - - - - - - vperm2f128 $136, %ymm0, %ymm0, %ymm1 # CHECK-NEXT: - - - 2.00 - 2.00 - - - - - - - - vaddps %ymm1, %ymm1, %ymm0 # 
CHECK: Timeline view: -# CHECK-NEXT: 01234 +# CHECK-NEXT: 0 # CHECK-NEXT: Index 0123456789 -# CHECK: [0,0] DeER . . . vperm2f128 $136, %ymm0, %ymm0, %ymm1 -# CHECK-NEXT: [0,1] .DeeeER . . vaddps %ymm1, %ymm1, %ymm0 -# CHECK-NEXT: [1,0] . D==eER . . vperm2f128 $136, %ymm0, %ymm0, %ymm1 -# CHECK-NEXT: [1,1] . D==eeeER . vaddps %ymm1, %ymm1, %ymm0 -# CHECK-NEXT: [2,0] . D====eER . vperm2f128 $136, %ymm0, %ymm0, %ymm1 -# CHECK-NEXT: [2,1] . D====eeeER vaddps %ymm1, %ymm1, %ymm0 +# CHECK: [0,0] DeER . . vperm2f128 $136, %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: [0,1] .DeeeER . vaddps %ymm1, %ymm1, %ymm0 +# CHECK-NEXT: [1,0] . DeE-R . vperm2f128 $136, %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: [1,1] . DeeeER . vaddps %ymm1, %ymm1, %ymm0 +# CHECK-NEXT: [2,0] . DeE-R . vperm2f128 $136, %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: [2,1] . DeeeER vaddps %ymm1, %ymm1, %ymm0 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -393,5 +393,5 @@ vaddps %ymm1, %ymm1, %ymm0 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 3 3.0 0.3 0.0 vperm2f128 $136, %ymm0, %ymm0, %ymm1 -# CHECK-NEXT: 1. 3 3.0 0.0 0.0 vaddps %ymm1, %ymm1, %ymm0 +# CHECK-NEXT: 0. 3 1.0 1.0 0.7 vperm2f128 $136, %ymm0, %ymm0, %ymm1 +# CHECK-NEXT: 1. 3 1.0 0.0 0.0 vaddps %ymm1, %ymm1, %ymm0