1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 19:23:23 +01:00

AMD znver1 Initial Scheduler model

Summary:
This patch does the following:
1. Adds a skeleton scheduler model for AMD Znver1.
2. Introduces the znver1 execution units and pipes.
3. Covers instructions through the generic scheduler classes.
4. Further additions to the scheduler model with instruction itineraries will be carried out incrementally, based on
        a. Instruction types
        b. Registers used
5. Since itineraries are not yet added per instruction, throughput information is bound to change as incremental changes are added.
6. Scheduler test cases are modified accordingly to suit the new model.

Patch by Ganesh Gopalasubramanian. With minor formatting tweaks from me.

Reviewers: craig.topper, RKSimon

Subscribers: javed.absar, shivaram, ddibyend, vprasad

Differential Revision: https://reviews.llvm.org/D35293

llvm-svn: 308411
This commit is contained in:
Craig Topper 2017-07-19 02:45:14 +00:00
parent 72231eb582
commit 66c70e9248
19 changed files with 2570 additions and 353 deletions

View File

@ -814,10 +814,8 @@ def : Proc<"bdver4", [
FeatureMWAITX
]>;
// TODO: The scheduler model falls to BTVER2 model.
// The znver1 model has to be put in place.
// Zen
def: ProcessorModel<"znver1", BtVer2Model, [
// Znver1
def: ProcessorModel<"znver1", Znver1Model, [
FeatureADX,
FeatureAES,
FeatureAVX2,

View File

@ -663,5 +663,6 @@ include "X86ScheduleAtom.td"
include "X86SchedSandyBridge.td"
include "X86SchedHaswell.td"
include "X86ScheduleSLM.td"
include "X86ScheduleZnver1.td"
include "X86ScheduleBtVer2.td"

View File

@ -0,0 +1,223 @@
//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the machine model for Znver1 to support instruction
// scheduling and other instruction cost heuristics.
//
//===----------------------------------------------------------------------===//
// Top-level machine model for AMD Znver1.
def Znver1Model : SchedMachineModel {
  // Issue width of 4, matching the 4 instructions Zen can decode per cycle.
  let IssueWidth = 4;

  // The micro-op buffer is sized from the reorder buffer.
  let MicroOpBufferSize = 192;

  let LoadLatency = 4;
  let MispredictPenalty = 17;
  let HighLatency = 25;
  let PostRAScheduler = 1;

  // FIXME: Not all instructions are catered for yet, so the model is
  // explicitly flagged as incomplete by resetting CompleteModel.
  let CompleteModel = 0;
}
let SchedModel = Znver1Model in {
// Zen can issue micro-ops to 10 different units in one cycle.
// These are
// * Four integer ALU units (ZnALU0, ZnALU1, ZnALU2, ZnALU3)
// * Two AGU units (ZnAGU0, ZnAGU1)
// * Four FPU units (ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3)
// The AGUs feed the load/store queues at a rate of two loads and one store
// per cycle.
// Every unit below is declared as a ProcResource with a single instance.
// Four ALU units are defined below
def ZnALU0 : ProcResource<1>;
def ZnALU1 : ProcResource<1>;
def ZnALU2 : ProcResource<1>;
def ZnALU3 : ProcResource<1>;
// Two AGU units are defined below
def ZnAGU0 : ProcResource<1>;
def ZnAGU1 : ProcResource<1>;
// Four FPU units are defined below
def ZnFPU0 : ProcResource<1>;
def ZnFPU1 : ProcResource<1>;
def ZnFPU2 : ProcResource<1>;
def ZnFPU3 : ProcResource<1>;
// FPU grouping
// Each group names a subset of the four FPU pipes; the digits in a group's
// name are the indices of its member pipes (e.g. ZnFPU013 = FPU0, FPU1, FPU3).
// ZnFPU, without a suffix, contains all four pipes.
def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]>;
def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>;
def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>;
def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>;
def ZnFPU13 : ProcResGroup<[ZnFPU1, ZnFPU3]>;
def ZnFPU23 : ProcResGroup<[ZnFPU2, ZnFPU3]>;
def ZnFPU02 : ProcResGroup<[ZnFPU0, ZnFPU2]>;
def ZnFPU03 : ProcResGroup<[ZnFPU0, ZnFPU3]>;
// Groupings of the integer-side units.
// A micro-op that may be issued to more than one unit is modeled by writing
// to a group instead of to an individual unit.

// ALU grouping
// ZnALU03 - ALU0 and ALU3.
def ZnALU03 : ProcResGroup<[ZnALU0, ZnALU3]>;

// All four ALUs, fed by the 56-entry (14x4 entries) integer scheduler.
def ZnALU : ProcResGroup<[ZnALU0, ZnALU1, ZnALU2, ZnALU3]> {
  let BufferSize = 56;
}

// Both AGUs, with a 28-entry (14x2) buffer. AGUs can't be used for all ALU
// operations but are relevant for some instructions.
def ZnAGU : ProcResGroup<[ZnAGU0, ZnAGU1]> {
  let BufferSize = 28;
}
// Integer Multiplication issued on ALU1.
// Declared as its own single-instance resource; the IMUL writes in this file
// consume it together with ZnALU1.
def ZnMultiplier : ProcResource<1>;
// Integer division issued on ALU2.
// Declared as its own single-instance resource; the IDIV writes in this file
// consume it together with ZnALU2.
def ZnDivider : ProcResource<1>;
// Captures the 4-cycle load-to-use latency: reads tagged ReadAfterLd are
// advanced by 4 cycles.
def : ReadAdvance<ReadAfterLd, 4>;
// A folded load is an instruction that loads and then does some operation on
// the loaded value.
// Ex: ADDPD xmm,[mem] -> this instruction has two micro-ops:
//   a. load and
//   b. addpd
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops.
//
// ZnWriteResPair defines both forms of a foldable write for the integer
// units: SchedRW itself (register form) and SchedRW.Folded (memory form).
//   SchedRW - the foldable scheduling write being described.
//   ExePort - resource (unit or group) that executes the operation.
//   Lat     - latency of the register form.
multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW,
                          ProcResourceKind ExePort,
                          int Lat> {
  // Register form: uses ExePort only, with latency Lat.
  def : WriteRes<SchedRW, [ExePort]> {
    let Latency = Lat;
  }

  // Memory form: also uses ZnAGU, and 4 cycles are added to the latency.
  def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
    let Latency = !add(Lat, 4);
  }
}
// ZnWriteResFpuPair defines both forms of a foldable write for the floating
// point units: SchedRW itself (register form) and SchedRW.Folded (memory
// form).
//   SchedRW - the foldable scheduling write being described.
//   ExePort - resource (unit or group) that executes the operation.
//   Lat     - latency of the register form.
multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW,
                             ProcResourceKind ExePort,
                             int Lat> {
  // Register form: uses ExePort only, with latency Lat.
  def : WriteRes<SchedRW, [ExePort]> {
    let Latency = Lat;
  }

  // Memory form: also uses ZnAGU, and 7 cycles are added to the latency.
  def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
    let Latency = !add(Lat, 7);
  }
}
// Memory writes go through the AGU group.
// WriteRMW is set for instructions with a memory write operation in codegen.
def : WriteRes<WriteRMW,   [ZnAGU]>;
def : WriteRes<WriteStore, [ZnAGU]>;

// WriteMove may issue to any ALU in the ZnALU group.
def : WriteRes<WriteMove,  [ZnALU]>;

// Plain loads use the AGU group and have a latency of 8.
let Latency = 8 in
def : WriteRes<WriteLoad,  [ZnAGU]>;

// WriteZero consumes no execution resources.
def : WriteRes<WriteZero,  []>;

// WriteLEA may issue to any ALU in the ZnALU group.
def : WriteRes<WriteLEA,   [ZnALU]>;

// Single-cycle integer operations on any ALU; the folded-load forms are
// defined by ZnWriteResPair as well.
defm : ZnWriteResPair<WriteALU,   ZnALU, 1>;
defm : ZnWriteResPair<WriteShift, ZnALU, 1>;
defm : ZnWriteResPair<WriteJump,  ZnALU, 1>;
// IDIV
// Register form: one cycle on ALU2 while the divider is held for 41 cycles.
def : WriteRes<WriteIDiv, [ZnALU2, ZnDivider]> {
  let ResourceCycles = [1, 41];
  let Latency = 41;
}

// Memory form: additionally holds ZnAGU for 4 cycles; the latency is 4 cycles
// longer than the register form (45).
def : WriteRes<WriteIDivLd, [ZnALU2, ZnAGU, ZnDivider]> {
  let ResourceCycles = [1, 4, 41];
  let Latency = 45;
}
// IMUL
// Integer multiplies issue on ALU1 and use the dedicated multiplier
// resource. Register forms have a latency of 4.
def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]> {
  let Latency = 4;
}
def : WriteRes<WriteIMul, [ZnALU1, ZnMultiplier]> {
  let Latency = 4;
}
// Folded-load form: the memory operand also needs ZnAGU, matching how
// ZnWriteResPair and WriteIDivLd model folded loads. The load adds 4 cycles
// to the latency (4 + 4 = 8).
def : WriteRes<WriteIMulLd, [ZnAGU, ZnALU1, ZnMultiplier]> {
  let Latency = 8;
}
// Floating point operations
// Each line gives <write, FPU unit or group, register-form latency>.
// ZnWriteResFpuPair also defines the folded-load form, which additionally
// uses ZnAGU and has 7 more cycles of latency.
defm : ZnWriteResFpuPair<WriteFHAdd, ZnFPU0, 3>;
defm : ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3>;
defm : ZnWriteResFpuPair<WriteFBlend, ZnFPU01, 1>;
defm : ZnWriteResFpuPair<WriteFVarBlend, ZnFPU01, 1>;
defm : ZnWriteResFpuPair<WriteVarBlend, ZnFPU0, 1>;
defm : ZnWriteResFpuPair<WriteCvtI2F, ZnFPU3, 5>;
defm : ZnWriteResFpuPair<WriteCvtF2F, ZnFPU3, 5>;
defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>;
defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>;
defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>;
defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>;
defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>;
defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>;
defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>;
// Vector integer operations, which use the FPU units.
// Each line gives <write, FPU unit or group, register-form latency>.
defm : ZnWriteResFpuPair<WriteVecShift, ZnFPU, 1>;
defm : ZnWriteResFpuPair<WriteVecLogic, ZnFPU, 1>;
defm : ZnWriteResFpuPair<WritePHAdd, ZnFPU, 1>;
defm : ZnWriteResFpuPair<WriteVecALU, ZnFPU, 1>;
defm : ZnWriteResFpuPair<WriteVecIMul, ZnFPU0, 4>;
defm : ZnWriteResFpuPair<WriteShuffle, ZnFPU, 1>;
defm : ZnWriteResFpuPair<WriteBlend, ZnFPU01, 1>;
defm : ZnWriteResFpuPair<WriteShuffle256, ZnFPU, 2>;
// Variable vector shift operations (restricted to FPU1/FPU2).
defm : ZnWriteResFpuPair<WriteVarVecShift, ZnFPU12, 1>;
// AES Instructions.
defm : ZnWriteResFpuPair<WriteAESDecEnc, ZnFPU01, 4>;
defm : ZnWriteResFpuPair<WriteAESIMC, ZnFPU01, 4>;
defm : ZnWriteResFpuPair<WriteAESKeyGen, ZnFPU01, 4>;
// Fences use the AGU group; nops consume no execution resources.
def : WriteRes<WriteFence, [ZnAGU]>;
def : WriteRes<WriteNop, []>;
// The writes below are treated as microcoded and are given a latency of 100.
// The long latency is set so as to block the entire pipeline.

// WriteFShuffle256 still names the ZnFPU group as its resource; its
// folded-load form is defined by ZnWriteResFpuPair as well.
defm : ZnWriteResFpuPair<WriteFShuffle256, ZnFPU, 100>;

// Microcoded instructions: latency 100 and no execution resources.
let Latency = 100 in {
  def : WriteRes<WriteMicrocoded, []>;
  def : WriteRes<WriteSystem, []>;
  def : WriteRes<WriteMPSAD, []>;
  def : WriteRes<WriteMPSADLd, []>;
  def : WriteRes<WriteCLMul, []>;
  def : WriteRes<WriteCLMulLd, []>;
  def : WriteRes<WritePCmpIStrM, []>;
  def : WriteRes<WritePCmpIStrMLd, []>;
  def : WriteRes<WritePCmpEStrI, []>;
  def : WriteRes<WritePCmpEStrILd, []>;
  def : WriteRes<WritePCmpEStrM, []>;
  def : WriteRes<WritePCmpEStrMLd, []>;
  def : WriteRes<WritePCmpIStrI, []>;
  def : WriteRes<WritePCmpIStrILd, []>;
}
}

File diff suppressed because it is too large Load Diff

View File

@ -13,10 +13,10 @@ define <32 x i8> @test_pabsb(<32 x i8> %a0, <32 x i8> *%a1) {
;
; ZNVER1-LABEL: test_pabsb:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpabsb (%rdi), %ymm1 # sched: [6:1.00]
; ZNVER1-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpabsb (%rdi), %ymm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpabsb %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0)
%2 = load <32 x i8>, <32 x i8> *%a1, align 32
%3 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %2)
@ -35,10 +35,10 @@ define <8 x i32> @test_pabsd(<8 x i32> %a0, <8 x i32> *%a1) {
;
; ZNVER1-LABEL: test_pabsd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpabsd (%rdi), %ymm1 # sched: [6:1.00]
; ZNVER1-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpabsd (%rdi), %ymm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpabsd %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0)
%2 = load <8 x i32>, <8 x i32> *%a1, align 32
%3 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %2)
@ -57,10 +57,10 @@ define <16 x i16> @test_pabsw(<16 x i16> %a0, <16 x i16> *%a1) {
;
; ZNVER1-LABEL: test_pabsw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpabsw (%rdi), %ymm1 # sched: [6:1.00]
; ZNVER1-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpabsw (%rdi), %ymm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpabsw %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0)
%2 = load <16 x i16>, <16 x i16> *%a1, align 32
%3 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %2)
@ -78,9 +78,9 @@ define <32 x i8> @test_paddb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
;
; ZNVER1-LABEL: test_paddb:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpaddb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = add <32 x i8> %a0, %a1
%2 = load <32 x i8>, <32 x i8> *%a2, align 32
%3 = add <32 x i8> %1, %2
@ -96,9 +96,9 @@ define <8 x i32> @test_paddd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
;
; ZNVER1-LABEL: test_paddd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = add <8 x i32> %a0, %a1
%2 = load <8 x i32>, <8 x i32> *%a2, align 32
%3 = add <8 x i32> %1, %2
@ -114,9 +114,9 @@ define <4 x i64> @test_paddq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
;
; ZNVER1-LABEL: test_paddq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = add <4 x i64> %a0, %a1
%2 = load <4 x i64>, <4 x i64> *%a2, align 32
%3 = add <4 x i64> %1, %2
@ -132,9 +132,9 @@ define <16 x i16> @test_paddw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
;
; ZNVER1-LABEL: test_paddw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = add <16 x i16> %a0, %a1
%2 = load <16 x i16>, <16 x i16> *%a2, align 32
%3 = add <16 x i16> %1, %2
@ -151,10 +151,10 @@ define <4 x i64> @test_pand(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
;
; ZNVER1-LABEL: test_pand:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpand %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpand (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = and <4 x i64> %a0, %a1
%2 = load <4 x i64>, <4 x i64> *%a2, align 32
%3 = and <4 x i64> %1, %2
@ -172,10 +172,10 @@ define <4 x i64> @test_pandn(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
;
; ZNVER1-LABEL: test_pandn:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [6:1.00]
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpandn %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpandn (%rdi), %ymm0, %ymm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = xor <4 x i64> %a0, <i64 -1, i64 -1, i64 -1, i64 -1>
%2 = and <4 x i64> %a1, %1
%3 = load <4 x i64>, <4 x i64> *%a2, align 32
@ -194,9 +194,9 @@ define <8 x i32> @test_pmulld(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
;
; ZNVER1-LABEL: test_pmulld:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpmulld %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmulld (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = mul <8 x i32> %a0, %a1
%2 = load <8 x i32>, <8 x i32> *%a2, align 32
%3 = mul <8 x i32> %1, %2
@ -212,9 +212,9 @@ define <16 x i16> @test_pmullw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2)
;
; ZNVER1-LABEL: test_pmullw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [2:1.00]
; ZNVER1-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [7:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpmullw %ymm1, %ymm0, %ymm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmullw (%rdi), %ymm0, %ymm0 # sched: [11:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = mul <16 x i16> %a0, %a1
%2 = load <16 x i16>, <16 x i16> *%a2, align 32
%3 = mul <16 x i16> %1, %2
@ -231,10 +231,10 @@ define <4 x i64> @test_por(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
;
; ZNVER1-LABEL: test_por:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = or <4 x i64> %a0, %a1
%2 = load <4 x i64>, <4 x i64> *%a2, align 32
%3 = or <4 x i64> %1, %2
@ -251,9 +251,9 @@ define <32 x i8> @test_psubb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> *%a2) {
;
; ZNVER1-LABEL: test_psubb:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpsubb %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubb (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = sub <32 x i8> %a0, %a1
%2 = load <32 x i8>, <32 x i8> *%a2, align 32
%3 = sub <32 x i8> %1, %2
@ -269,9 +269,9 @@ define <8 x i32> @test_psubd(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> *%a2) {
;
; ZNVER1-LABEL: test_psubd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubd (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = sub <8 x i32> %a0, %a1
%2 = load <8 x i32>, <8 x i32> *%a2, align 32
%3 = sub <8 x i32> %1, %2
@ -287,9 +287,9 @@ define <4 x i64> @test_psubq(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
;
; ZNVER1-LABEL: test_psubq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpsubq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubq (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = sub <4 x i64> %a0, %a1
%2 = load <4 x i64>, <4 x i64> *%a2, align 32
%3 = sub <4 x i64> %1, %2
@ -305,9 +305,9 @@ define <16 x i16> @test_psubw(<16 x i16> %a0, <16 x i16> %a1, <16 x i16> *%a2) {
;
; ZNVER1-LABEL: test_psubw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsubw (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = sub <16 x i16> %a0, %a1
%2 = load <16 x i16>, <16 x i16> *%a2, align 32
%3 = sub <16 x i16> %1, %2
@ -324,10 +324,10 @@ define <4 x i64> @test_pxor(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> *%a2) {
;
; ZNVER1-LABEL: test_pxor:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [6:1.00]
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vpxor %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpxor (%rdi), %ymm0, %ymm0 # sched: [8:0.50]
; ZNVER1-NEXT: vpaddq %ymm1, %ymm0, %ymm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = xor <4 x i64> %a0, %a1
%2 = load <4 x i64>, <4 x i64> *%a2, align 32
%3 = xor <4 x i64> %1, %2

View File

@ -4,7 +4,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define i16 @test_andn_i16(i16 zeroext %a0, i16 zeroext %a1, i16 *%a2) {
; GENERIC-LABEL: test_andn_i16:
@ -33,6 +33,15 @@ define i16 @test_andn_i16(i16 zeroext %a0, i16 zeroext %a1, i16 *%a2) {
; BTVER2-NEXT: addl %edi, %eax # sched: [1:0.50]
; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andn_i16:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: andnl %esi, %edi, %eax # sched: [1:0.25]
; ZNVER1-NEXT: notl %edi # sched: [1:0.25]
; ZNVER1-NEXT: andw (%rdx), %di # sched: [5:0.50]
; ZNVER1-NEXT: addl %edi, %eax # sched: [1:0.25]
; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i16, i16 *%a2
%2 = xor i16 %a0, -1
%3 = and i16 %2, %a1
@ -62,6 +71,13 @@ define i32 @test_andn_i32(i32 %a0, i32 %a1, i32 *%a2) {
; BTVER2-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.50]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andn_i32:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: andnl (%rdx), %edi, %eax # sched: [5:0.50]
; ZNVER1-NEXT: andnl %esi, %edi, %ecx # sched: [1:0.25]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i32, i32 *%a2
%2 = xor i32 %a0, -1
%3 = and i32 %2, %a1
@ -91,6 +107,13 @@ define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) {
; BTVER2-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.50]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andn_i64:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: andnq (%rdx), %rdi, %rax # sched: [5:0.50]
; ZNVER1-NEXT: andnq %rsi, %rdi, %rcx # sched: [1:0.25]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i64, i64 *%a2
%2 = xor i64 %a0, -1
%3 = and i64 %2, %a1
@ -120,6 +143,13 @@ define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) {
; BTVER2-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_bextr_i32:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: bextrl %edi, (%rdx), %ecx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: bextrl %edi, %esi, %eax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i32, i32 *%a2
%2 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %1, i32 %a0)
%3 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a1, i32 %a0)
@ -149,6 +179,13 @@ define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) {
; BTVER2-NEXT: bextrq %rdi, %rsi, %rax # sched: [?:0.000000e+00]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_bextr_i64:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: bextrq %rdi, %rsi, %rax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i64, i64 *%a2
%2 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %1, i64 %a0)
%3 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a1, i64 %a0)
@ -178,6 +215,13 @@ define i32 @test_blsi_i32(i32 %a0, i32 *%a1) {
; BTVER2-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blsi_i32:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: blsil (%rsi), %ecx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: blsil %edi, %eax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i32, i32 *%a1
%2 = sub i32 0, %1
%3 = sub i32 0, %a0
@ -208,6 +252,13 @@ define i64 @test_blsi_i64(i64 %a0, i64 *%a1) {
; BTVER2-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blsi_i64:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: blsiq (%rsi), %rcx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: blsiq %rdi, %rax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i64, i64 *%a1
%2 = sub i64 0, %1
%3 = sub i64 0, %a0
@ -238,6 +289,13 @@ define i32 @test_blsmsk_i32(i32 %a0, i32 *%a1) {
; BTVER2-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blsmsk_i32:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: blsmskl (%rsi), %ecx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: blsmskl %edi, %eax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i32, i32 *%a1
%2 = sub i32 %1, 1
%3 = sub i32 %a0, 1
@ -268,6 +326,13 @@ define i64 @test_blsmsk_i64(i64 %a0, i64 *%a1) {
; BTVER2-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blsmsk_i64:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: blsmskq (%rsi), %rcx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: blsmskq %rdi, %rax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i64, i64 *%a1
%2 = sub i64 %1, 1
%3 = sub i64 %a0, 1
@ -298,6 +363,13 @@ define i32 @test_blsr_i32(i32 %a0, i32 *%a1) {
; BTVER2-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blsr_i32:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: blsrl (%rsi), %ecx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: blsrl %edi, %eax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i32, i32 *%a1
%2 = sub i32 %1, 1
%3 = sub i32 %a0, 1
@ -328,6 +400,13 @@ define i64 @test_blsr_i64(i64 %a0, i64 *%a1) {
; BTVER2-NEXT: blsrq %rdi, %rax # sched: [?:0.000000e+00]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blsr_i64:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: blsrq (%rsi), %rcx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: blsrq %rdi, %rax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i64, i64 *%a1
%2 = sub i64 %1, 1
%3 = sub i64 %a0, 1
@ -361,6 +440,14 @@ define i16 @test_cttz_i16(i16 zeroext %a0, i16 *%a1) {
; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cttz_i16:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: tzcntw (%rsi), %cx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: tzcntw %di, %ax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i16, i16 *%a1
%2 = tail call i16 @llvm.cttz.i16( i16 %1, i1 false )
%3 = tail call i16 @llvm.cttz.i16( i16 %a0, i1 false )
@ -390,6 +477,13 @@ define i32 @test_cttz_i32(i32 %a0, i32 *%a1) {
; BTVER2-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00]
; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cttz_i32:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: tzcntl (%rsi), %ecx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: tzcntl %edi, %eax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i32, i32 *%a1
%2 = tail call i32 @llvm.cttz.i32( i32 %1, i1 false )
%3 = tail call i32 @llvm.cttz.i32( i32 %a0, i1 false )
@ -419,6 +513,13 @@ define i64 @test_cttz_i64(i64 %a0, i64 *%a1) {
; BTVER2-NEXT: tzcntq %rdi, %rax # sched: [?:0.000000e+00]
; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cttz_i64:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: tzcntq (%rsi), %rcx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: tzcntq %rdi, %rax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i64, i64 *%a1
%2 = tail call i64 @llvm.cttz.i64( i64 %1, i1 false )
%3 = tail call i64 @llvm.cttz.i64( i64 %a0, i1 false )

View File

@ -24,8 +24,8 @@ define i32 @test_bzhi_i32(i32 %a0, i32 %a1, i32 *%a2) {
; ZNVER1: # BB#0:
; ZNVER1-NEXT: bzhil %edi, (%rdx), %ecx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: bzhil %edi, %esi, %eax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i32, i32 *%a2
%2 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %1, i32 %a0)
%3 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a1, i32 %a0)
@ -53,8 +53,8 @@ define i64 @test_bzhi_i64(i64 %a0, i64 %a1, i64 *%a2) {
; ZNVER1: # BB#0:
; ZNVER1-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: bzhiq %rdi, %rsi, %rax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i64, i64 *%a2
%2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %1, i64 %a0)
%3 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a1, i64 %a0)
@ -82,8 +82,8 @@ define i32 @test_pdep_i32(i32 %a0, i32 %a1, i32 *%a2) {
; ZNVER1: # BB#0:
; ZNVER1-NEXT: pdepl (%rdx), %edi, %ecx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: pdepl %esi, %edi, %eax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i32, i32 *%a2
%2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %1)
%3 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1)
@ -111,8 +111,8 @@ define i64 @test_pdep_i64(i64 %a0, i64 %a1, i64 *%a2) {
; ZNVER1: # BB#0:
; ZNVER1-NEXT: pdepq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: pdepq %rsi, %rdi, %rax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i64, i64 *%a2
%2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %1)
%3 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1)
@ -140,8 +140,8 @@ define i32 @test_pext_i32(i32 %a0, i32 %a1, i32 *%a2) {
; ZNVER1: # BB#0:
; ZNVER1-NEXT: pextl (%rdx), %edi, %ecx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: pextl %esi, %edi, %eax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i32, i32 *%a2
%2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %1)
%3 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1)
@ -169,8 +169,8 @@ define i64 @test_pext_i64(i64 %a0, i64 %a1, i64 *%a2) {
; ZNVER1: # BB#0:
; ZNVER1-NEXT: pextq (%rdx), %rdi, %rcx # sched: [?:0.000000e+00]
; ZNVER1-NEXT: pextq %rsi, %rdi, %rax # sched: [?:0.000000e+00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i64, i64 *%a2
%2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %1)
%3 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1)

View File

@ -29,10 +29,10 @@ define <4 x float> @test_vcvtph2ps_128(<8 x i16> %a0, <8 x i16> *%a1) {
;
; ZNVER1-LABEL: test_vcvtph2ps_128:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [8:1.00]
; ZNVER1-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vcvtph2ps (%rdi), %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vcvtph2ps %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load <8 x i16>, <8 x i16> *%a1
%2 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %1)
%3 = call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %a0)
@ -65,10 +65,10 @@ define <8 x float> @test_vcvtph2ps_256(<8 x i16> %a0, <8 x i16> *%a1) {
;
; ZNVER1-LABEL: test_vcvtph2ps_256:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [8:1.00]
; ZNVER1-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:2.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vcvtph2ps (%rdi), %ymm1 # sched: [12:1.00]
; ZNVER1-NEXT: vcvtph2ps %xmm0, %ymm0 # sched: [5:1.00]
; ZNVER1-NEXT: vaddps %ymm0, %ymm1, %ymm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load <8 x i16>, <8 x i16> *%a1
%2 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %1)
%3 = call <8 x float> @llvm.x86.vcvtph2ps.256(<8 x i16> %a0)
@ -98,9 +98,9 @@ define <8 x i16> @test_vcvtps2ph_128(<4 x float> %a0, <4 x float> %a1, <4 x i16>
;
; ZNVER1-LABEL: test_vcvtps2ph_128:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [8:1.00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: vcvtps2ph $0, %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vcvtps2ph $0, %xmm1, (%rdi) # sched: [12:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a0, i32 0)
%2 = call <8 x i16> @llvm.x86.vcvtps2ph.128(<4 x float> %a1, i32 0)
%3 = shufflevector <8 x i16> %2, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@ -132,10 +132,10 @@ define <8 x i16> @test_vcvtps2ph_256(<8 x float> %a0, <8 x float> %a1, <8 x i16>
;
; ZNVER1-LABEL: test_vcvtps2ph_256:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [8:1.00]
; ZNVER1-NEXT: vcvtps2ph $0, %ymm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vcvtps2ph $0, %ymm1, (%rdi) # sched: [12:1.00]
; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00]
; ZNVER1-NEXT: retq # sched: [4:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a0, i32 0)
%2 = call <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float> %a1, i32 0)
store <8 x i16> %2, <8 x i16> *%a2

View File

@ -8,7 +8,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define i32 @test_lea_offset(i32) {
; GENERIC-LABEL: test_lea_offset:
@ -52,6 +52,12 @@ define i32 @test_lea_offset(i32) {
; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; BTVER2-NEXT: leal -24(%rdi), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_offset:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; ZNVER1-NEXT: leal -24(%rdi), %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%2 = add nsw i32 %0, -24
ret i32 %2
}
@ -98,6 +104,12 @@ define i32 @test_lea_offset_big(i32) {
; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; BTVER2-NEXT: leal 1024(%rdi), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_offset_big:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; ZNVER1-NEXT: leal 1024(%rdi), %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%2 = add nsw i32 %0, 1024
ret i32 %2
}
@ -151,6 +163,13 @@ define i32 @test_lea_add(i32, i32) {
; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; BTVER2-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_add:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; ZNVER1-NEXT: leal (%rdi,%rsi), %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%3 = add nsw i32 %1, %0
ret i32 %3
}
@ -205,6 +224,13 @@ define i32 @test_lea_add_offset(i32, i32) {
; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; BTVER2-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_add_offset:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; ZNVER1-NEXT: leal 16(%rdi,%rsi), %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%3 = add i32 %0, 16
%4 = add i32 %3, %1
ret i32 %4
@ -262,6 +288,13 @@ define i32 @test_lea_add_offset_big(i32, i32) {
; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; BTVER2-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_add_offset_big:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; ZNVER1-NEXT: leal -4096(%rdi,%rsi), %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%3 = add i32 %0, -4096
%4 = add i32 %3, %1
ret i32 %4
@ -309,6 +342,12 @@ define i32 @test_lea_mul(i32) {
; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; BTVER2-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_mul:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; ZNVER1-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%2 = mul nsw i32 %0, 3
ret i32 %2
}
@ -357,6 +396,12 @@ define i32 @test_lea_mul_offset(i32) {
; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; BTVER2-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_mul_offset:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; ZNVER1-NEXT: leal -32(%rdi,%rdi,2), %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%2 = mul nsw i32 %0, 3
%3 = add nsw i32 %2, -32
ret i32 %3
@ -408,6 +453,12 @@ define i32 @test_lea_mul_offset_big(i32) {
; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; BTVER2-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_mul_offset_big:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; ZNVER1-NEXT: leal 10000(%rdi,%rdi,8), %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%2 = mul nsw i32 %0, 9
%3 = add nsw i32 %2, 10000
ret i32 %3
@ -461,6 +512,13 @@ define i32 @test_lea_add_scale(i32, i32) {
; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; BTVER2-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_add_scale:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; ZNVER1-NEXT: leal (%rdi,%rsi,2), %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%3 = shl i32 %1, 1
%4 = add nsw i32 %3, %0
ret i32 %4
@ -516,6 +574,13 @@ define i32 @test_lea_add_scale_offset(i32, i32) {
; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; BTVER2-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_add_scale_offset:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; ZNVER1-NEXT: leal 96(%rdi,%rsi,4), %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%3 = shl i32 %1, 2
%4 = add i32 %0, 96
%5 = add i32 %4, %3
@ -574,6 +639,13 @@ define i32 @test_lea_add_scale_offset_big(i32, i32) {
; BTVER2-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; BTVER2-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_add_scale_offset_big:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: # kill: %ESI<def> %ESI<kill> %RSI<def>
; ZNVER1-NEXT: # kill: %EDI<def> %EDI<kill> %RDI<def>
; ZNVER1-NEXT: leal -1200(%rdi,%rsi,8), %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%3 = shl i32 %1, 3
%4 = add i32 %0, -1200
%5 = add i32 %4, %3

View File

@ -8,7 +8,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define i64 @test_lea_offset(i64) {
; GENERIC-LABEL: test_lea_offset:
@ -46,6 +46,11 @@ define i64 @test_lea_offset(i64) {
; BTVER2: # BB#0:
; BTVER2-NEXT: leaq -24(%rdi), %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_offset:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: leaq -24(%rdi), %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%2 = add nsw i64 %0, -24
ret i64 %2
}
@ -86,6 +91,11 @@ define i64 @test_lea_offset_big(i64) {
; BTVER2: # BB#0:
; BTVER2-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_offset_big:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: leaq 1024(%rdi), %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%2 = add nsw i64 %0, 1024
ret i64 %2
}
@ -127,6 +137,11 @@ define i64 @test_lea_add(i64, i64) {
; BTVER2: # BB#0:
; BTVER2-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_add:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: leaq (%rdi,%rsi), %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%3 = add nsw i64 %1, %0
ret i64 %3
}
@ -169,6 +184,11 @@ define i64 @test_lea_add_offset(i64, i64) {
; BTVER2: # BB#0:
; BTVER2-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_add_offset:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: leaq 16(%rdi,%rsi), %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%3 = add i64 %0, 16
%4 = add i64 %3, %1
ret i64 %4
@ -214,6 +234,11 @@ define i64 @test_lea_add_offset_big(i64, i64) {
; BTVER2: # BB#0:
; BTVER2-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_add_offset_big:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: leaq -4096(%rdi,%rsi), %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%3 = add i64 %0, -4096
%4 = add i64 %3, %1
ret i64 %4
@ -255,6 +280,11 @@ define i64 @test_lea_mul(i64) {
; BTVER2: # BB#0:
; BTVER2-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_mul:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: leaq (%rdi,%rdi,2), %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%2 = mul nsw i64 %0, 3
ret i64 %2
}
@ -297,6 +327,11 @@ define i64 @test_lea_mul_offset(i64) {
; BTVER2: # BB#0:
; BTVER2-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_mul_offset:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: leaq -32(%rdi,%rdi,2), %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%2 = mul nsw i64 %0, 3
%3 = add nsw i64 %2, -32
ret i64 %3
@ -342,6 +377,11 @@ define i64 @test_lea_mul_offset_big(i64) {
; BTVER2: # BB#0:
; BTVER2-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_mul_offset_big:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: leaq 10000(%rdi,%rdi,8), %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%2 = mul nsw i64 %0, 9
%3 = add nsw i64 %2, 10000
ret i64 %3
@ -383,6 +423,11 @@ define i64 @test_lea_add_scale(i64, i64) {
; BTVER2: # BB#0:
; BTVER2-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_add_scale:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: leaq (%rdi,%rsi,2), %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%3 = shl i64 %1, 1
%4 = add nsw i64 %3, %0
ret i64 %4
@ -426,6 +471,11 @@ define i64 @test_lea_add_scale_offset(i64, i64) {
; BTVER2: # BB#0:
; BTVER2-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_add_scale_offset:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: leaq 96(%rdi,%rsi,4), %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%3 = shl i64 %1, 2
%4 = add i64 %0, 96
%5 = add i64 %4, %3
@ -472,6 +522,11 @@ define i64 @test_lea_add_scale_offset_big(i64, i64) {
; BTVER2: # BB#0:
; BTVER2-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lea_add_scale_offset_big:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: leaq -1200(%rdi,%rsi,8), %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%3 = shl i64 %1, 3
%4 = add i64 %0, -1200
%5 = add i64 %4, %3

View File

@ -4,7 +4,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define i16 @test_ctlz_i16(i16 zeroext %a0, i16 *%a1) {
; GENERIC-LABEL: test_ctlz_i16:
@ -30,6 +30,14 @@ define i16 @test_ctlz_i16(i16 zeroext %a0, i16 *%a1) {
; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ctlz_i16:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: lzcntw (%rsi), %cx
; ZNVER1-NEXT: lzcntw %di, %ax
; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i16, i16 *%a1
%2 = tail call i16 @llvm.ctlz.i16( i16 %1, i1 false )
%3 = tail call i16 @llvm.ctlz.i16( i16 %a0, i1 false )
@ -59,6 +67,13 @@ define i32 @test_ctlz_i32(i32 %a0, i32 *%a1) {
; BTVER2-NEXT: lzcntl %edi, %eax
; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ctlz_i32:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: lzcntl (%rsi), %ecx
; ZNVER1-NEXT: lzcntl %edi, %eax
; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i32, i32 *%a1
%2 = tail call i32 @llvm.ctlz.i32( i32 %1, i1 false )
%3 = tail call i32 @llvm.ctlz.i32( i32 %a0, i1 false )
@ -88,6 +103,13 @@ define i64 @test_ctlz_i64(i64 %a0, i64 *%a1) {
; BTVER2-NEXT: lzcntq %rdi, %rax
; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ctlz_i64:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: lzcntq (%rsi), %rcx
; ZNVER1-NEXT: lzcntq %rdi, %rax
; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i64, i64 *%a1
%2 = tail call i64 @llvm.ctlz.i64( i64 %1, i1 false )
%3 = tail call i64 @llvm.ctlz.i64( i64 %a0, i1 false )

View File

@ -8,7 +8,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define i16 @test_ctpop_i16(i16 zeroext %a0, i16 *%a1) {
; GENERIC-LABEL: test_ctpop_i16:
@ -50,6 +50,14 @@ define i16 @test_ctpop_i16(i16 zeroext %a0, i16 *%a1) {
; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ctpop_i16:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: popcntw (%rsi), %cx # sched: [10:1.00]
; ZNVER1-NEXT: popcntw %di, %ax # sched: [3:1.00]
; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: # kill: %AX<def> %AX<kill> %EAX<kill>
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i16, i16 *%a1
%2 = tail call i16 @llvm.ctpop.i16( i16 %1 )
%3 = tail call i16 @llvm.ctpop.i16( i16 %a0 )
@ -93,6 +101,13 @@ define i32 @test_ctpop_i32(i32 %a0, i32 *%a1) {
; BTVER2-NEXT: popcntl %edi, %eax # sched: [3:1.00]
; BTVER2-NEXT: orl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ctpop_i32:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: popcntl (%rsi), %ecx # sched: [10:1.00]
; ZNVER1-NEXT: popcntl %edi, %eax # sched: [3:1.00]
; ZNVER1-NEXT: orl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i32, i32 *%a1
%2 = tail call i32 @llvm.ctpop.i32( i32 %1 )
%3 = tail call i32 @llvm.ctpop.i32( i32 %a0 )
@ -136,6 +151,13 @@ define i64 @test_ctpop_i64(i64 %a0, i64 *%a1) {
; BTVER2-NEXT: popcntq %rdi, %rax # sched: [3:1.00]
; BTVER2-NEXT: orq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ctpop_i64:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: popcntq (%rsi), %rcx # sched: [10:1.00]
; ZNVER1-NEXT: popcntq %rdi, %rax # sched: [3:1.00]
; ZNVER1-NEXT: orq %rcx, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load i64, i64 *%a1
%2 = tail call i64 @llvm.ctpop.i64( i64 %1 )
%3 = tail call i64 @llvm.ctpop.i64( i64 %a0 )

View File

@ -7,7 +7,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; GENERIC-LABEL: test_addps:
@ -45,6 +45,12 @@ define <4 x float> @test_addps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fadd <4 x float> %a0, %a1
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = fadd <4 x float> %1, %2
@ -87,6 +93,12 @@ define float @test_addss(float %a0, float %a1, float *%a2) {
; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fadd float %a0, %a1
%2 = load float, float *%a2, align 4
%3 = fadd float %1, %2
@ -137,6 +149,12 @@ define <4 x float> @test_andps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; BTVER2-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vandps %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vandps (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = bitcast <4 x float> %a0 to <4 x i32>
%2 = bitcast <4 x float> %a1 to <4 x i32>
%3 = and <4 x i32> %1, %2
@ -191,6 +209,12 @@ define <4 x float> @test_andnotps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; BTVER2-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_andnotps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vandnps %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vandnps (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = bitcast <4 x float> %a0 to <4 x i32>
%2 = bitcast <4 x float> %a1 to <4 x i32>
%3 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
@ -245,6 +269,13 @@ define <4 x float> @test_cmpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; BTVER2-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cmpps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcmpeqps %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
; ZNVER1-NEXT: vcmpeqps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: vorps %xmm0, %xmm1, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fcmp oeq <4 x float> %a0, %a1
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = fcmp oeq <4 x float> %a0, %2
@ -290,6 +321,12 @@ define float @test_cmpss(float %a0, float %a1, float *%a2) {
; BTVER2-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cmpss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcmpeqss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vcmpeqss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <4 x float> undef, float %a0, i32 0
%2 = insertelement <4 x float> undef, float %a1, i32 0
%3 = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %1, <4 x float> %2, i8 0)
@ -385,6 +422,20 @@ define i32 @test_comiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50]
; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_comiss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcomiss %xmm1, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
; ZNVER1-NEXT: sete %cl # sched: [1:0.25]
; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25]
; ZNVER1-NEXT: vcomiss (%rdi), %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
; ZNVER1-NEXT: sete %dl # sched: [1:0.25]
; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25]
; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25]
; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 4
%3 = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %2)
@ -435,6 +486,13 @@ define float @test_cvtsi2ss(i32 %a0, i32 *%a1) {
; BTVER2-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtsi2ss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvtsi2ssl %edi, %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vcvtsi2ssl (%rsi), %xmm1, %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = sitofp i32 %a0 to float
%2 = load i32, i32 *%a1, align 4
%3 = sitofp i32 %2 to float
@ -484,6 +542,13 @@ define float @test_cvtsi2ssq(i64 %a0, i64 *%a1) {
; BTVER2-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [8:1.00]
; BTVER2-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtsi2ssq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvtsi2ssq %rdi, %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vcvtsi2ssq (%rsi), %xmm1, %xmm1 # sched: [12:1.00]
; ZNVER1-NEXT: vaddss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = sitofp i64 %a0 to float
%2 = load i64, i64 *%a1, align 8
%3 = sitofp i64 %2 to float
@ -533,6 +598,13 @@ define i32 @test_cvtss2si(float %a0, float *%a1) {
; BTVER2-NEXT: vcvtss2si %xmm0, %ecx # sched: [3:1.00]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtss2si:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvtss2si (%rdi), %eax # sched: [12:1.00]
; ZNVER1-NEXT: vcvtss2si %xmm0, %ecx # sched: [5:1.00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <4 x float> undef, float %a0, i32 0
%2 = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %1)
%3 = load float, float *%a1, align 4
@ -585,6 +657,13 @@ define i64 @test_cvtss2siq(float %a0, float *%a1) {
; BTVER2-NEXT: vcvtss2si %xmm0, %rcx # sched: [3:1.00]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvtss2siq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvtss2si (%rdi), %rax # sched: [12:1.00]
; ZNVER1-NEXT: vcvtss2si %xmm0, %rcx # sched: [5:1.00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <4 x float> undef, float %a0, i32 0
%2 = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %1)
%3 = load float, float *%a1, align 4
@ -637,6 +716,13 @@ define i32 @test_cvttss2si(float %a0, float *%a1) {
; BTVER2-NEXT: vcvttss2si %xmm0, %ecx # sched: [3:1.00]
; BTVER2-NEXT: addl %ecx, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvttss2si:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvttss2si (%rdi), %eax # sched: [12:1.00]
; ZNVER1-NEXT: vcvttss2si %xmm0, %ecx # sched: [5:1.00]
; ZNVER1-NEXT: addl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fptosi float %a0 to i32
%2 = load float, float *%a1, align 4
%3 = fptosi float %2 to i32
@ -686,6 +772,13 @@ define i64 @test_cvttss2siq(float %a0, float *%a1) {
; BTVER2-NEXT: vcvttss2si %xmm0, %rcx # sched: [3:1.00]
; BTVER2-NEXT: addq %rcx, %rax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_cvttss2siq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vcvttss2si (%rdi), %rax # sched: [12:1.00]
; ZNVER1-NEXT: vcvttss2si %xmm0, %rcx # sched: [5:1.00]
; ZNVER1-NEXT: addq %rcx, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fptosi float %a0 to i64
%2 = load float, float *%a1, align 4
%3 = fptosi float %2 to i64
@ -729,6 +822,12 @@ define <4 x float> @test_divps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; BTVER2-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [24:19.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_divps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vdivps %xmm1, %xmm0, %xmm0 # sched: [15:1.00]
; ZNVER1-NEXT: vdivps (%rdi), %xmm0, %xmm0 # sched: [22:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fdiv <4 x float> %a0, %a1
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = fdiv <4 x float> %1, %2
@ -771,6 +870,12 @@ define float @test_divss(float %a0, float %a1, float *%a2) {
; BTVER2-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [19:19.00]
; BTVER2-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [24:19.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_divss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vdivss %xmm1, %xmm0, %xmm0 # sched: [15:1.00]
; ZNVER1-NEXT: vdivss (%rdi), %xmm0, %xmm0 # sched: [22:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fdiv float %a0, %a1
%2 = load float, float *%a2, align 4
%3 = fdiv float %1, %2
@ -813,6 +918,12 @@ define void @test_ldmxcsr(i32 %a0) {
; BTVER2-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:1.00]
; BTVER2-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [5:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ldmxcsr:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # sched: [1:0.50]
; ZNVER1-NEXT: vldmxcsr -{{[0-9]+}}(%rsp) # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = alloca i32, align 4
%2 = bitcast i32* %1 to i8*
store i32 %a0, i32* %1
@ -857,6 +968,12 @@ define <4 x float> @test_maxps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; BTVER2-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maxps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmaxps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmaxps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %1, <4 x float> %2)
@ -900,6 +1017,12 @@ define <4 x float> @test_maxss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; BTVER2-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_maxss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmaxss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmaxss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %1, <4 x float> %2)
@ -943,6 +1066,12 @@ define <4 x float> @test_minps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; BTVER2-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_minps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vminps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vminps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %1, <4 x float> %2)
@ -986,6 +1115,12 @@ define <4 x float> @test_minss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; BTVER2-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_minss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vminss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vminss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %1, <4 x float> %2)
@ -1035,6 +1170,13 @@ define void @test_movaps(<4 x float> *%a0, <4 x float> *%a1) {
; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movaps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovaps (%rdi), %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovaps %xmm0, (%rsi) # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load <4 x float>, <4 x float> *%a0, align 16
%2 = fadd <4 x float> %1, %1
store <4 x float> %2, <4 x float> *%a1, align 16
@ -1079,6 +1221,11 @@ define <4 x float> @test_movhlps(<4 x float> %a0, <4 x float> %a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movhlps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm1[1],xmm0[1] sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 6, i32 7, i32 2, i32 3>
ret <4 x float> %1
}
@ -1129,6 +1276,13 @@ define void @test_movhps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movhps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] sched: [8:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = bitcast x86_mmx* %a2 to <2 x float>*
%2 = load <2 x float>, <2 x float> *%1, align 8
%3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@ -1177,6 +1331,12 @@ define <4 x float> @test_movlhps(<4 x float> %a0, <4 x float> %a1) {
; BTVER2-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movlhps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] sched: [1:0.50]
; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%2 = fadd <4 x float> %a1, %1
ret <4 x float> %2
@ -1224,6 +1384,13 @@ define void @test_movlps(<4 x float> %a0, <4 x float> %a1, x86_mmx *%a2) {
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movlps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovlpd {{.*#+}} xmm1 = mem[0],xmm1[1] sched: [8:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovlps %xmm0, (%rdi) # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = bitcast x86_mmx* %a2 to <2 x float>*
%2 = load <2 x float>, <2 x float> *%1, align 8
%3 = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@ -1266,6 +1433,11 @@ define i32 @test_movmskps(<4 x float> %a0) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movmskps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovmskps %xmm0, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0)
ret i32 %1
}
@ -1307,6 +1479,11 @@ define void @test_movntps(<4 x float> %a0, <4 x float> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovntps %xmm0, (%rdi) # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
store <4 x float> %a0, <4 x float> *%a1, align 16, !nontemporal !0
ret void
}
@ -1353,6 +1530,13 @@ define void @test_movss_mem(float* %a0, float* %a1) {
; BTVER2-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovss %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movss_mem:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero sched: [8:0.50]
; ZNVER1-NEXT: vaddss %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovss %xmm0, (%rsi) # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load float, float* %a0, align 1
%2 = fadd float %1, %1
store float %2, float *%a1, align 1
@ -1395,6 +1579,11 @@ define <4 x float> @test_movss_reg(<4 x float> %a0, <4 x float> %a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movss_reg:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 4, i32 1, i32 2, i32 3>
ret <4 x float> %1
}
@ -1441,6 +1630,13 @@ define void @test_movups(<4 x float> *%a0, <4 x float> *%a1) {
; BTVER2-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vmovups %xmm0, (%rsi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movups:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovups (%rdi), %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: vaddps %xmm0, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vmovups %xmm0, (%rsi) # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load <4 x float>, <4 x float> *%a0, align 1
%2 = fadd <4 x float> %1, %1
store <4 x float> %2, <4 x float> *%a1, align 1
@ -1483,6 +1679,12 @@ define <4 x float> @test_mulps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; BTVER2-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mulps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmulps %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vmulps (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fmul <4 x float> %a0, %a1
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = fmul <4 x float> %1, %2
@ -1525,6 +1727,12 @@ define float @test_mulss(float %a0, float %a1, float *%a2) {
; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mulss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmulss %xmm1, %xmm0, %xmm0 # sched: [5:1.00]
; ZNVER1-NEXT: vmulss (%rdi), %xmm0, %xmm0 # sched: [12:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fmul float %a0, %a1
%2 = load float, float *%a2, align 4
%3 = fmul float %1, %2
@ -1575,6 +1783,12 @@ define <4 x float> @test_orps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
; BTVER2-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_orps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vorps (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = bitcast <4 x float> %a0 to <4 x i32>
%2 = bitcast <4 x float> %a1 to <4 x i32>
%3 = or <4 x i32> %1, %2
@ -1621,6 +1835,11 @@ define void @test_prefetchnta(i8* %a0) {
; BTVER2: # BB#0:
; BTVER2-NEXT: prefetchnta (%rdi) # sched: [5:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_prefetchnta:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: prefetchnta (%rdi) # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
call void @llvm.prefetch(i8* %a0, i32 0, i32 0, i32 1)
ret void
}
@ -1670,6 +1889,13 @@ define <4 x float> @test_rcpps(<4 x float> %a0, <4 x float> *%a1) {
; BTVER2-NEXT: vrcpps %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rcpps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vrcpps (%rdi), %xmm1 # sched: [12:0.50]
; ZNVER1-NEXT: vrcpps %xmm0, %xmm0 # sched: [5:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0)
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %2)
@ -1728,6 +1954,14 @@ define <4 x float> @test_rcpss(float %a0, float *%a1) {
; BTVER2-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [7:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rcpss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50]
; ZNVER1-NEXT: vrcpss %xmm0, %xmm0, %xmm0 # sched: [12:0.50]
; ZNVER1-NEXT: vrcpss %xmm1, %xmm1, %xmm1 # sched: [12:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <4 x float> undef, float %a0, i32 0
%2 = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %1)
%3 = load float, float *%a1, align 4
@ -1782,6 +2016,13 @@ define <4 x float> @test_rsqrtps(<4 x float> %a0, <4 x float> *%a1) {
; BTVER2-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rsqrtps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vrsqrtps (%rdi), %xmm1 # sched: [12:0.50]
; ZNVER1-NEXT: vrsqrtps %xmm0, %xmm0 # sched: [5:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0)
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %2)
@ -1840,6 +2081,14 @@ define <4 x float> @test_rsqrtss(float %a0, float *%a1) {
; BTVER2-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [7:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_rsqrtss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero sched: [8:0.50]
; ZNVER1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 # sched: [12:0.50]
; ZNVER1-NEXT: vrsqrtss %xmm1, %xmm1, %xmm1 # sched: [12:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <4 x float> undef, float %a0, i32 0
%2 = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %1)
%3 = load float, float *%a1, align 4
@ -1886,6 +2135,11 @@ define void @test_sfence() {
; BTVER2: # BB#0:
; BTVER2-NEXT: sfence # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sfence:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: sfence # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
call void @llvm.x86.sse.sfence()
ret void
}
@ -1931,6 +2185,12 @@ define <4 x float> @test_shufps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50]
; BTVER2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_shufps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,0] sched: [1:0.50]
; ZNVER1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,3],mem[0,0] sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 0, i32 4, i32 4>
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 3, i32 4, i32 4>
@ -1980,6 +2240,13 @@ define <4 x float> @test_sqrtps(<4 x float> %a0, <4 x float> *%a1) {
; BTVER2-NEXT: vsqrtps %xmm0, %xmm0 # sched: [21:21.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vsqrtps (%rdi), %xmm1 # sched: [27:1.00]
; ZNVER1-NEXT: vsqrtps %xmm0, %xmm0 # sched: [20:1.00]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %2)
@ -2038,6 +2305,14 @@ define <4 x float> @test_sqrtss(<4 x float> %a0, <4 x float> *%a1) {
; BTVER2-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [26:21.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_sqrtss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovaps (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # sched: [27:1.00]
; ZNVER1-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 # sched: [27:1.00]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0)
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %2)
@ -2082,6 +2357,12 @@ define i32 @test_stmxcsr() {
; BTVER2-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:1.00]
; BTVER2-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [5:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_stmxcsr:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vstmxcsr -{{[0-9]+}}(%rsp) # sched: [1:0.50]
; ZNVER1-NEXT: movl -{{[0-9]+}}(%rsp), %eax # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = alloca i32, align 4
%2 = bitcast i32* %1 to i8*
call void @llvm.x86.sse.stmxcsr(i8* %2)
@ -2126,6 +2407,12 @@ define <4 x float> @test_subps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; BTVER2-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fsub <4 x float> %a0, %a1
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = fsub <4 x float> %1, %2
@ -2168,6 +2455,12 @@ define float @test_subss(float %a0, float %a1, float *%a2) {
; BTVER2-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_subss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vsubss %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vsubss (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = fsub float %a0, %a1
%2 = load float, float *%a2, align 4
%3 = fsub float %1, %2
@ -2258,6 +2551,20 @@ define i32 @test_ucomiss(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) {
; BTVER2-NEXT: orb %cl, %dl # sched: [1:0.50]
; BTVER2-NEXT: movzbl %dl, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ucomiss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vucomiss %xmm1, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
; ZNVER1-NEXT: sete %cl # sched: [1:0.25]
; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25]
; ZNVER1-NEXT: vucomiss (%rdi), %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: setnp %al # sched: [1:0.25]
; ZNVER1-NEXT: sete %dl # sched: [1:0.25]
; ZNVER1-NEXT: andb %al, %dl # sched: [1:0.25]
; ZNVER1-NEXT: orb %cl, %dl # sched: [1:0.25]
; ZNVER1-NEXT: movzbl %dl, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 4
%3 = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %2)
@ -2306,6 +2613,12 @@ define <4 x float> @test_unpckhps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
; BTVER2-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpckhps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] sched: [1:0.50]
; ZNVER1-NEXT: vunpckhps {{.*#+}} xmm0 = xmm0[2],mem[2],xmm0[3],mem[3] sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
@ -2352,6 +2665,12 @@ define <4 x float> @test_unpcklps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
; BTVER2-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_unpcklps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] sched: [1:0.50]
; ZNVER1-NEXT: vunpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
@ -2402,6 +2721,12 @@ define <4 x float> @test_xorps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a
; BTVER2-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_xorps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vxorps %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vxorps (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = bitcast <4 x float> %a0 to <4 x i32>
%2 = bitcast <4 x float> %a1 to <4 x i32>
%3 = xor <4 x i32> %1, %2

File diff suppressed because it is too large Load Diff

View File

@ -7,7 +7,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_addsubpd:
@ -45,6 +45,12 @@ define <2 x double> @test_addsubpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; BTVER2-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addsubpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vaddsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %1, <2 x double> %2)
@ -88,6 +94,12 @@ define <4 x float> @test_addsubps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; BTVER2-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_addsubps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vaddsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %1, <4 x float> %2)
@ -131,6 +143,12 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double
; BTVER2-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_haddpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vhaddpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %1, <2 x double> %2)
@ -174,6 +192,12 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
; BTVER2-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_haddps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vhaddps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %1, <4 x float> %2)
@ -217,6 +241,12 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double
; BTVER2-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_hsubpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vhsubpd (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %1, <2 x double> %2)
@ -260,6 +290,12 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
; BTVER2-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_hsubps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vhsubps (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %1, <4 x float> %2)
@ -299,6 +335,11 @@ define <16 x i8> @test_lddqu(i8* %a0) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vlddqu (%rdi), %xmm0 # sched: [5:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_lddqu:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vlddqu (%rdi), %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0)
ret <16 x i8> %1
}
@ -347,6 +388,13 @@ define <2 x double> @test_movddup(<2 x double> %a0, <2 x double> *%a1) {
; BTVER2-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movddup:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovddup {{.*#+}} xmm1 = mem[0,0] sched: [8:0.50]
; ZNVER1-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0] sched: [1:0.50]
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
%2 = load <2 x double>, <2 x double> *%a1, align 16
%3 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
@ -397,6 +445,13 @@ define <4 x float> @test_movshdup(<4 x float> %a0, <4 x float> *%a1) {
; BTVER2-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movshdup:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm1 = mem[1,1,3,3] sched: [8:0.50]
; ZNVER1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] sched: [1:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 1, i32 1, i32 3, i32 3>
@ -447,6 +502,13 @@ define <4 x float> @test_movsldup(<4 x float> %a0, <4 x float> *%a1) {
; BTVER2-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movsldup:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm1 = mem[0,0,2,2] sched: [8:0.50]
; ZNVER1-NEXT: vmovsldup {{.*#+}} xmm0 = xmm0[0,0,2,2] sched: [1:0.50]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> <i32 0, i32 0, i32 2, i32 2>

View File

@ -6,7 +6,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) {
; GENERIC-LABEL: test_blendpd:
@ -43,6 +43,13 @@ define <2 x double> @test_blendpd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],xmm1[1] sched: [1:0.50]
; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vblendpd {{.*#+}} xmm0 = xmm0[0],mem[1] sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <2 x double> %a0, <2 x double> %a1, <2 x i32> <i32 0, i32 3>
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = fadd <2 x double> %a1, %1
@ -80,6 +87,12 @@ define <4 x float> @test_blendps(<4 x float> %a0, <4 x float> %a1, <4 x float> *
; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
; BTVER2-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3] sched: [1:0.50]
; ZNVER1-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2,3] sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = shufflevector <4 x float> %1, <4 x float> %2, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
@ -122,6 +135,12 @@ define <2 x double> @test_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x doub
; BTVER2-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendvpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; ZNVER1-NEXT: vblendvpd %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
%2 = load <2 x double>, <2 x double> *%a3, align 16
%3 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %1, <2 x double> %2, <2 x double> %a2)
@ -165,6 +184,12 @@ define <4 x float> @test_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float>
; BTVER2-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_blendvps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; ZNVER1-NEXT: vblendvps %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
%2 = load <4 x float>, <4 x float> *%a3
%3 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %1, <4 x float> %2, <4 x float> %a2)
@ -202,6 +227,12 @@ define <2 x double> @test_dppd(<2 x double> %a0, <2 x double> %a1, <2 x double>
; BTVER2-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_dppd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vdppd $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vdppd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7)
%2 = load <2 x double>, <2 x double> *%a2, align 16
%3 = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %1, <2 x double> %2, i8 7)
@ -239,6 +270,12 @@ define <4 x float> @test_dpps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2
; BTVER2-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_dpps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vdpps $7, %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vdpps $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %1, <4 x float> %2, i8 7)
@ -276,6 +313,12 @@ define <4 x float> @test_insertps(<4 x float> %a0, <4 x float> %a1, float *%a2)
; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50]
; BTVER2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_insertps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = zero,xmm1[0],xmm0[2,3] sched: [1:0.50]
; ZNVER1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17)
%2 = load float, float *%a2
%3 = insertelement <4 x float> %1, float %2, i32 3
@ -308,6 +351,11 @@ define <2 x i64> @test_movntdqa(i8* %a0) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [5:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntdqa:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmovntdqa (%rdi), %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x i64> @llvm.x86.sse41.movntdqa(i8* %a0)
ret <2 x i64> %1
}
@ -343,6 +391,12 @@ define <8 x i16> @test_mpsadbw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [3:2.00]
; BTVER2-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [8:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_mpsadbw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vmpsadbw $7, %xmm1, %xmm0, %xmm0 # sched: [100:0.00]
; ZNVER1-NEXT: vmpsadbw $7, (%rdi), %xmm0, %xmm0 # sched: [100:0.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7)
%2 = bitcast <8 x i16> %1 to <16 x i8>
%3 = load <16 x i8>, <16 x i8> *%a2, align 16
@ -381,6 +435,12 @@ define <8 x i16> @test_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_packusdw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpackusdw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
%2 = bitcast <8 x i16> %1 to <4 x i32>
%3 = load <4 x i32>, <4 x i32> *%a2, align 16
@ -425,6 +485,12 @@ define <16 x i8> @test_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2, <16
; BTVER2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pblendvb:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 # sched: [1:1.00]
; ZNVER1-NEXT: vpblendvb %xmm2, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2)
%2 = load <16 x i8>, <16 x i8> *%a3, align 16
%3 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %1, <16 x i8> %2, <16 x i8> %a2)
@ -462,6 +528,12 @@ define <8 x i16> @test_pblendw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
; BTVER2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pblendw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7] sched: [1:0.50]
; ZNVER1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3],xmm0[4,5,6],mem[7] sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = shufflevector <8 x i16> %1, <8 x i16> %2, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 15>
@ -498,6 +570,12 @@ define <2 x i64> @test_pcmpeqq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; BTVER2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpeqq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpeqq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = icmp eq <2 x i64> %a0, %a1
%2 = sext <2 x i1> %1 to <2 x i64>
%3 = load <2 x i64>, <2 x i64>*%a2, align 16
@ -536,6 +614,12 @@ define i32 @test_pextrb(<16 x i8> %a0, i8 *%a1) {
; BTVER2-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.50]
; BTVER2-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pextrb:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpextrb $3, %xmm0, %eax # sched: [1:0.25]
; ZNVER1-NEXT: vpextrb $1, %xmm0, (%rdi) # sched: [8:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = extractelement <16 x i8> %a0, i32 3
%2 = extractelement <16 x i8> %a0, i32 1
store i8 %2, i8 *%a1
@ -573,6 +657,12 @@ define i32 @test_pextrd(<4 x i32> %a0, i32 *%a1) {
; BTVER2-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.50]
; BTVER2-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pextrd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpextrd $3, %xmm0, %eax # sched: [1:0.25]
; ZNVER1-NEXT: vpextrd $1, %xmm0, (%rdi) # sched: [8:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = extractelement <4 x i32> %a0, i32 3
%2 = extractelement <4 x i32> %a0, i32 1
store i32 %2, i32 *%a1
@ -609,6 +699,12 @@ define i64 @test_pextrq(<2 x i64> %a0, <2 x i64> %a1, i64 *%a2) {
; BTVER2-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.50]
; BTVER2-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pextrq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpextrq $1, %xmm0, %rax # sched: [1:0.25]
; ZNVER1-NEXT: vpextrq $1, %xmm0, (%rdi) # sched: [8:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = extractelement <2 x i64> %a0, i32 1
%2 = extractelement <2 x i64> %a0, i32 1
store i64 %2, i64 *%a2
@ -645,6 +741,12 @@ define i32 @test_pextrw(<8 x i16> %a0, i16 *%a1) {
; BTVER2-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.50]
; BTVER2-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pextrw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpextrw $3, %xmm0, %eax # sched: [1:0.25]
; ZNVER1-NEXT: vpextrw $1, %xmm0, (%rdi) # sched: [8:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = extractelement <8 x i16> %a0, i32 3
%2 = extractelement <8 x i16> %a0, i32 1
store i16 %2, i16 *%a1
@ -682,6 +784,12 @@ define <8 x i16> @test_phminposuw(<8 x i16> *%a0) {
; BTVER2-NEXT: vphminposuw (%rdi), %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: vphminposuw %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phminposuw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vphminposuw (%rdi), %xmm0 # sched: [11:1.00]
; ZNVER1-NEXT: vphminposuw %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = load <8 x i16>, <8 x i16> *%a0, align 16
%2 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %1)
%3 = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %2)
@ -719,6 +827,12 @@ define <16 x i8> @test_pinsrb(<16 x i8> %a0, i8 %a1, i8 *%a2) {
; BTVER2-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pinsrb:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpinsrb $1, %edi, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpinsrb $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <16 x i8> %a0, i8 %a1, i32 1
%2 = load i8, i8 *%a2
%3 = insertelement <16 x i8> %1, i8 %2, i32 3
@ -755,6 +869,12 @@ define <4 x i32> @test_pinsrd(<4 x i32> %a0, i32 %a1, i32 *%a2) {
; BTVER2-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pinsrd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpinsrd $1, %edi, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpinsrd $3, (%rsi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <4 x i32> %a0, i32 %a1, i32 1
%2 = load i32, i32 *%a2
%3 = insertelement <4 x i32> %1, i32 %2, i32 3
@ -796,6 +916,13 @@ define <2 x i64> @test_pinsrq(<2 x i64> %a0, <2 x i64> %a1, i64 %a2, i64 *%a3) {
; BTVER2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pinsrq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpinsrq $1, (%rsi), %xmm1, %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = insertelement <2 x i64> %a0, i64 %a2, i32 1
%2 = load i64, i64 *%a3
%3 = insertelement <2 x i64> %a1, i64 %2, i32 1
@ -833,6 +960,12 @@ define <16 x i8> @test_pmaxsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmaxsb:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmaxsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpmaxsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %1, <16 x i8> %2)
@ -870,6 +1003,12 @@ define <4 x i32> @test_pmaxsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmaxsd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmaxsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpmaxsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %1, <4 x i32> %2)
@ -907,6 +1046,12 @@ define <4 x i32> @test_pmaxud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmaxud:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmaxud %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpmaxud (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %1, <4 x i32> %2)
@ -944,6 +1089,12 @@ define <8 x i16> @test_pmaxuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmaxuw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmaxuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpmaxuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %1, <8 x i16> %2)
@ -981,6 +1132,12 @@ define <16 x i8> @test_pminsb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pminsb:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpminsb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpminsb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %1, <16 x i8> %2)
@ -1018,6 +1175,12 @@ define <4 x i32> @test_pminsd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pminsd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpminsd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpminsd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %1, <4 x i32> %2)
@ -1055,6 +1218,12 @@ define <4 x i32> @test_pminud(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pminud:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpminud %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpminud (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %1, <4 x i32> %2)
@ -1092,6 +1261,12 @@ define <8 x i16> @test_pminuw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pminuw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpminuw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpminuw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %1, <8 x i16> %2)
@ -1135,6 +1310,13 @@ define <8 x i16> @test_pmovsxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; BTVER2-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovsxbw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmovsxbw (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpmovsxbw %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = sext <8 x i8> %1 to <8 x i16>
%3 = load <8 x i8>, <8 x i8>* %a1, align 1
@ -1179,6 +1361,13 @@ define <4 x i32> @test_pmovsxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; BTVER2-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovsxbd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmovsxbd (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpmovsxbd %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = sext <4 x i8> %1 to <4 x i32>
%3 = load <4 x i8>, <4 x i8>* %a1, align 1
@ -1223,6 +1412,13 @@ define <2 x i64> @test_pmovsxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; BTVER2-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovsxbq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmovsxbq (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpmovsxbq %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
%2 = sext <2 x i8> %1 to <2 x i64>
%3 = load <2 x i8>, <2 x i8>* %a1, align 1
@ -1267,6 +1463,13 @@ define <2 x i64> @test_pmovsxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; BTVER2-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovsxdq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmovsxdq (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpmovsxdq %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%2 = sext <2 x i32> %1 to <2 x i64>
%3 = load <2 x i32>, <2 x i32>* %a1, align 1
@ -1311,6 +1514,13 @@ define <4 x i32> @test_pmovsxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; BTVER2-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovsxwd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmovsxwd (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpmovsxwd %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = sext <4 x i16> %1 to <4 x i32>
%3 = load <4 x i16>, <4 x i16>* %a1, align 1
@ -1355,6 +1565,13 @@ define <2 x i64> @test_pmovsxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; BTVER2-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovsxwq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmovsxwq (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpmovsxwq %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%2 = sext <2 x i16> %1 to <2 x i64>
%3 = load <2 x i16>, <2 x i16>* %a1, align 1
@ -1399,6 +1616,13 @@ define <8 x i16> @test_pmovzxbw(<16 x i8> %a0, <8 x i8> *%a1) {
; BTVER2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.50]
; BTVER2-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovzxbw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero sched: [8:0.50]
; ZNVER1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero sched: [1:0.25]
; ZNVER1-NEXT: vpaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%2 = zext <8 x i8> %1 to <8 x i16>
%3 = load <8 x i8>, <8 x i8>* %a1, align 1
@ -1443,6 +1667,13 @@ define <4 x i32> @test_pmovzxbd(<16 x i8> %a0, <4 x i8> *%a1) {
; BTVER2-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.50]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovzxbd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero sched: [8:0.50]
; ZNVER1-NEXT: vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero sched: [1:0.25]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = zext <4 x i8> %1 to <4 x i32>
%3 = load <4 x i8>, <4 x i8>* %a1, align 1
@ -1487,6 +1718,13 @@ define <2 x i64> @test_pmovzxbq(<16 x i8> %a0, <2 x i8> *%a1) {
; BTVER2-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovzxbq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero sched: [8:0.50]
; ZNVER1-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
%2 = zext <2 x i8> %1 to <2 x i64>
%3 = load <2 x i8>, <2 x i8>* %a1, align 1
@ -1531,6 +1769,13 @@ define <2 x i64> @test_pmovzxdq(<4 x i32> %a0, <2 x i32> *%a1) {
; BTVER2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovzxdq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero sched: [8:0.50]
; ZNVER1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
%2 = zext <2 x i32> %1 to <2 x i64>
%3 = load <2 x i32>, <2 x i32>* %a1, align 1
@ -1575,6 +1820,13 @@ define <4 x i32> @test_pmovzxwd(<8 x i16> %a0, <4 x i16> *%a1) {
; BTVER2-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.50]
; BTVER2-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovzxwd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero sched: [8:0.50]
; ZNVER1-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero sched: [1:0.25]
; ZNVER1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = zext <4 x i16> %1 to <4 x i32>
%3 = load <4 x i16>, <4 x i16>* %a1, align 1
@ -1619,6 +1871,13 @@ define <2 x i64> @test_pmovzxwq(<8 x i16> %a0, <2 x i16> *%a1) {
; BTVER2-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.50]
; BTVER2-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmovzxwq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero sched: [8:0.50]
; ZNVER1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero sched: [1:0.25]
; ZNVER1-NEXT: vpaddq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
%2 = zext <2 x i16> %1 to <2 x i64>
%3 = load <2 x i16>, <2 x i16>* %a1, align 1
@ -1657,6 +1916,12 @@ define <2 x i64> @test_pmuldq(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmuldq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmuldq %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmuldq (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)
%2 = bitcast <2 x i64> %1 to <4 x i32>
%3 = load <4 x i32>, <4 x i32> *%a2, align 16
@ -1695,6 +1960,12 @@ define <4 x i32> @test_pmulld(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmulld:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmulld %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmulld (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = mul <4 x i32> %a0, %a1
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = mul <4 x i32> %1, %2
@ -1751,6 +2022,16 @@ define i32 @test_ptest(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; BTVER2-NEXT: andb %al, %cl # sched: [1:0.50]
; BTVER2-NEXT: movzbl %cl, %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_ptest:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vptest %xmm1, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: setb %al # sched: [1:0.25]
; ZNVER1-NEXT: vptest (%rdi), %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: setb %cl # sched: [1:0.25]
; ZNVER1-NEXT: andb %al, %cl # sched: [1:0.25]
; ZNVER1-NEXT: movzbl %cl, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
%2 = load <2 x i64>, <2 x i64> *%a2, align 16
%3 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %2)
@ -1795,6 +2076,13 @@ define <2 x double> @test_roundpd(<2 x double> %a0, <2 x double> *%a1) {
; BTVER2-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundpd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vroundpd $7, (%rdi), %xmm1 # sched: [10:1.00]
; ZNVER1-NEXT: vroundpd $7, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7)
%2 = load <2 x double>, <2 x double> *%a1, align 16
%3 = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %2, i32 7)
@ -1839,6 +2127,13 @@ define <4 x float> @test_roundps(<4 x float> %a0, <4 x float> *%a1) {
; BTVER2-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundps:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vroundps $7, (%rdi), %xmm1 # sched: [10:1.00]
; ZNVER1-NEXT: vroundps $7, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: vaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7)
%2 = load <4 x float>, <4 x float> *%a1, align 16
%3 = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %2, i32 7)
@ -1884,6 +2179,13 @@ define <2 x double> @test_roundsd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
; BTVER2-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundsd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vroundsd $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
; ZNVER1-NEXT: vroundsd $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: vaddpd %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7)
%2 = load <2 x double>, <2 x double>* %a2, align 16
%3 = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %2, i32 7)
@ -1929,6 +2231,13 @@ define <4 x float> @test_roundss(<4 x float> %a0, <4 x float> %a1, <4 x float> *
; BTVER2-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
; BTVER2-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_roundss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vroundss $7, %xmm1, %xmm0, %xmm1 # sched: [3:1.00]
; ZNVER1-NEXT: vroundss $7, (%rdi), %xmm0, %xmm0 # sched: [10:1.00]
; ZNVER1-NEXT: vaddps %xmm0, %xmm1, %xmm0 # sched: [3:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7)
%2 = load <4 x float>, <4 x float> *%a2, align 16
%3 = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %2, i32 7)

View File

@ -6,7 +6,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
; GENERIC-LABEL: crc32_32_8:
@ -43,6 +43,13 @@ define i32 @crc32_32_8(i32 %a0, i8 %a1, i8 *%a2) {
; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: crc32_32_8:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00]
; ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00]
; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a0, i8 %a1)
%2 = load i8, i8 *%a2
%3 = call i32 @llvm.x86.sse42.crc32.32.8(i32 %1, i8 %2)
@ -85,6 +92,13 @@ define i32 @crc32_32_16(i32 %a0, i16 %a1, i16 *%a2) {
; BTVER2-NEXT: crc32w (%rdx), %edi # sched: [8:1.00]
; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: crc32_32_16:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: crc32w %si, %edi # sched: [3:1.00]
; ZNVER1-NEXT: crc32w (%rdx), %edi # sched: [10:1.00]
; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a0, i16 %a1)
%2 = load i16, i16 *%a2
%3 = call i32 @llvm.x86.sse42.crc32.32.16(i32 %1, i16 %2)
@ -127,6 +141,13 @@ define i32 @crc32_32_32(i32 %a0, i32 %a1, i32 *%a2) {
; BTVER2-NEXT: crc32l (%rdx), %edi # sched: [8:1.00]
; BTVER2-NEXT: movl %edi, %eax # sched: [1:0.17]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: crc32_32_32:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: crc32l %esi, %edi # sched: [3:1.00]
; ZNVER1-NEXT: crc32l (%rdx), %edi # sched: [10:1.00]
; ZNVER1-NEXT: movl %edi, %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a0, i32 %a1)
%2 = load i32, i32 *%a2
%3 = call i32 @llvm.x86.sse42.crc32.32.32(i32 %1, i32 %2)
@ -169,6 +190,13 @@ define i64 @crc32_64_8(i64 %a0, i8 %a1, i8 *%a2) nounwind {
; BTVER2-NEXT: crc32b (%rdx), %edi # sched: [8:1.00]
; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: crc32_64_8:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: crc32b %sil, %edi # sched: [3:1.00]
; ZNVER1-NEXT: crc32b (%rdx), %edi # sched: [10:1.00]
; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a0, i8 %a1)
%2 = load i8, i8 *%a2
%3 = call i64 @llvm.x86.sse42.crc32.64.8(i64 %1, i8 %2)
@ -211,6 +239,13 @@ define i64 @crc32_64_64(i64 %a0, i64 %a1, i64 *%a2) {
; BTVER2-NEXT: crc32q (%rdx), %rdi # sched: [8:1.00]
; BTVER2-NEXT: movq %rdi, %rax # sched: [1:0.17]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: crc32_64_64:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: crc32q %rsi, %rdi # sched: [3:1.00]
; ZNVER1-NEXT: crc32q (%rdx), %rdi # sched: [10:1.00]
; ZNVER1-NEXT: movq %rdi, %rax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a0, i64 %a1)
%2 = load i64, i64 *%a2
%3 = call i64 @llvm.x86.sse42.crc32.64.64(i64 %1, i64 %2)
@ -283,6 +318,19 @@ define i32 @test_pcmpestri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; BTVER2-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpestri:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpestri $7, %xmm1, %xmm0 # sched: [100:0.00]
; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
; ZNVER1-NEXT: movl %ecx, %esi # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpestri $7, (%rdi), %xmm0 # sched: [100:0.00]
; ZNVER1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; ZNVER1-NEXT: leal (%rcx,%rsi), %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %2, i32 7, i8 7)
@ -341,6 +389,16 @@ define <16 x i8> @test_pcmpestrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: movl $7, %edx # sched: [1:0.17]
; BTVER2-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [18:2.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpestrm:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpestrm $7, %xmm1, %xmm0 # sched: [100:0.00]
; ZNVER1-NEXT: movl $7, %eax # sched: [1:0.25]
; ZNVER1-NEXT: movl $7, %edx # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpestrm $7, (%rdi), %xmm0 # sched: [100:0.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a1, i32 7, i8 7)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7)
@ -393,6 +451,15 @@ define i32 @test_pcmpistri(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; BTVER2-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpistri:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpcmpistri $7, %xmm1, %xmm0 # sched: [100:0.00]
; ZNVER1-NEXT: movl %ecx, %eax # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpistri $7, (%rdi), %xmm0 # sched: [100:0.00]
; ZNVER1-NEXT: # kill: %ECX<def> %ECX<kill> %RCX<def>
; ZNVER1-NEXT: leal (%rcx,%rax), %eax # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %2, i8 7)
@ -431,6 +498,12 @@ define <16 x i8> @test_pcmpistrm(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [12:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpistrm:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpcmpistrm $7, %xmm1, %xmm0 # sched: [100:0.00]
; ZNVER1-NEXT: vpcmpistrm $7, (%rdi), %xmm0 # sched: [100:0.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %1, <16 x i8> %2, i8 7)
@ -468,6 +541,12 @@ define <2 x i64> @test_pcmpgtq(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> *%a2) {
; BTVER2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pcmpgtq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpcmpgtq (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = icmp sgt <2 x i64> %a0, %a1
%2 = sext <2 x i1> %1 to <2 x i64>
%3 = load <2 x i64>, <2 x i64>*%a2, align 16

View File

@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mattr=+sse4a | FileCheck %s --check-prefix=GENERIC
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=ZNVER1
define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) {
; GENERIC-LABEL: test_extrq:
@ -13,6 +13,11 @@ define <2 x i64> @test_extrq(<2 x i64> %a0, <16 x i8> %a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: extrq %xmm1, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_extrq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: extrq %xmm1, %xmm0 # sched: [?:0.000000e+00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrq(<2 x i64> %a0, <16 x i8> %a1)
ret <2 x i64> %1
}
@ -28,6 +33,11 @@ define <2 x i64> @test_extrqi(<2 x i64> %a0) {
; BTVER2: # BB#0:
; BTVER2-NEXT: extrq $2, $3, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_extrqi:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: extrq $2, $3, %xmm0 # sched: [?:0.000000e+00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = tail call <2 x i64> @llvm.x86.sse4a.extrqi(<2 x i64> %a0, i8 3, i8 2)
ret <2 x i64> %1
}
@ -43,6 +53,11 @@ define <2 x i64> @test_insertq(<2 x i64> %a0, <2 x i64> %a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: insertq %xmm1, %xmm0 # sched: [2:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_insertq:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: insertq %xmm1, %xmm0 # sched: [?:0.000000e+00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertq(<2 x i64> %a0, <2 x i64> %a1)
ret <2 x i64> %1
}
@ -58,6 +73,11 @@ define <2 x i64> @test_insertqi(<2 x i64> %a0, <2 x i64> %a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [2:2.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_insertqi:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: insertq $6, $5, %xmm1, %xmm0 # sched: [?:0.000000e+00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = tail call <2 x i64> @llvm.x86.sse4a.insertqi(<2 x i64> %a0, <2 x i64> %a1, i8 5, i8 6)
ret <2 x i64> %1
}
@ -73,6 +93,11 @@ define void @test_movntsd(i8* %p, <2 x double> %a) {
; BTVER2: # BB#0:
; BTVER2-NEXT: movntsd %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntsd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: movntsd %xmm0, (%rdi) # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
tail call void @llvm.x86.sse4a.movnt.sd(i8* %p, <2 x double> %a)
ret void
}
@ -88,6 +113,11 @@ define void @test_movntss(i8* %p, <4 x float> %a) {
; BTVER2: # BB#0:
; BTVER2-NEXT: movntss %xmm0, (%rdi) # sched: [1:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_movntss:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: movntss %xmm0, (%rdi) # sched: [1:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a)
ret void
}

View File

@ -7,7 +7,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=haswell | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=skylake | FileCheck %s --check-prefix=CHECK --check-prefix=HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=btver2 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -print-schedule -mcpu=znver1 | FileCheck %s --check-prefix=CHECK --check-prefix=ZNVER1
define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
; GENERIC-LABEL: test_pabsb:
@ -52,6 +52,13 @@ define <16 x i8> @test_pabsb(<16 x i8> %a0, <16 x i8> *%a1) {
; BTVER2-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pabsb:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpabsb (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpabsb %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)
%2 = load <16 x i8>, <16 x i8> *%a1, align 16
%3 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %2)
@ -103,6 +110,13 @@ define <4 x i32> @test_pabsd(<4 x i32> %a0, <4 x i32> *%a1) {
; BTVER2-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pabsd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpabsd (%rdi), %xmm1 # sched: [8:0.50]
; ZNVER1-NEXT: vpabsd %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpor %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0)
%2 = load <4 x i32>, <4 x i32> *%a1, align 16
%3 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %2)
@ -147,6 +161,11 @@ define <8 x i16> @test_pabsw(<8 x i16> %a0, <8 x i16> *%a1) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pabsw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpabsw %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0)
%2 = load <8 x i16>, <8 x i16> *%a1, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %2)
@ -196,6 +215,12 @@ define <8 x i16> @test_palignr(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.50]
; BTVER2-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_palignr:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5] sched: [1:0.25]
; ZNVER1-NEXT: vpalignr {{.*#+}} xmm0 = mem[14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13] sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = shufflevector <8 x i16> %a0, <8 x i16> %a1, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = shufflevector <8 x i16> %2, <8 x i16> %1, <8 x i32> <i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14>
@ -238,6 +263,12 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phaddd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vphaddd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %1, <4 x i32> %2)
@ -289,6 +320,12 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phaddsw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vphaddsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %1, <8 x i16> %2)
@ -332,6 +369,12 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phaddw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vphaddw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %1, <8 x i16> %2)
@ -375,6 +418,12 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phsubd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vphsubd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %1, <4 x i32> %2)
@ -426,6 +475,12 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phsubsw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vphsubsw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %1, <8 x i16> %2)
@ -469,6 +524,12 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_phsubw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vphsubw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %1, <8 x i16> %2)
@ -512,6 +573,12 @@ define <8 x i16> @test_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmaddubsw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: vpmaddubsw (%rdi), %xmm0, %xmm0 # sched: [11:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = bitcast <8 x i16> %1 to <16 x i8>
@ -550,6 +617,11 @@ define <8 x i16> @test_pmulhrsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2: # BB#0:
; BTVER2-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [2:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pmulhrsw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpmulhrsw %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %1, <8 x i16> %2)
@ -593,6 +665,12 @@ define <16 x i8> @test_pshufb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_pshufb:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpshufb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpshufb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> %2)
@ -644,6 +722,12 @@ define <16 x i8> @test_psignb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> *%a2) {
; BTVER2-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psignb:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpsignb %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsignb (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1)
%2 = load <16 x i8>, <16 x i8> *%a2, align 16
%3 = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %1, <16 x i8> %2)
@ -695,6 +779,12 @@ define <4 x i32> @test_psignd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
; BTVER2-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psignd:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpsignd %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsignd (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1)
%2 = load <4 x i32>, <4 x i32> *%a2, align 16
%3 = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %1, <4 x i32> %2)
@ -746,6 +836,12 @@ define <8 x i16> @test_psignw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
; BTVER2-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
; BTVER2-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
; BTVER2-NEXT: retq # sched: [4:1.00]
;
; ZNVER1-LABEL: test_psignw:
; ZNVER1: # BB#0:
; ZNVER1-NEXT: vpsignw %xmm1, %xmm0, %xmm0 # sched: [1:0.25]
; ZNVER1-NEXT: vpsignw (%rdi), %xmm0, %xmm0 # sched: [8:0.50]
; ZNVER1-NEXT: retq # sched: [5:0.50]
%1 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1)
%2 = load <8 x i16>, <8 x i16> *%a2, align 16
%3 = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %1, <8 x i16> %2)