
[X86] Scalar mask and scalar move optimizations

1. Introduction of mask scalar TableGen patterns.
2. Introduction of new scalar move TableGen patterns
   and refactoring of existing ones.
3. Folding of the pattern created by introducing scalar
   masking in the Clang header files (a sketch of that
   pattern follows below).
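
For context, this is the shape item 3 refers to: the Clang headers implement
masked scalar operations with a test on the lowest mask bit, and the resulting
select is what the new DAG combine folds. A minimal C++ sketch (mask_add_ss is
a hypothetical helper, not the actual header code; subscripting __m128 is a
GCC/Clang vector extension):

#include <xmmintrin.h>

// Hypothetical sketch of a header-style masked scalar add; only bit 0
// of the mask matters for a scalar op.  The ?: below is the
// (select (setcc (and U, 1), 0, seteq), ...) shape the combine folds.
static inline __m128 mask_add_ss(__m128 W, unsigned char U,
                                 __m128 A, __m128 B) {
  A[0] = (U & 1) ? A[0] + B[0] : W[0];  // masked update of lane 0 only
  return A;
}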

Patch by tkrupa

Differential Revision: https://reviews.llvm.org/D47012

llvm-svn: 333419
Alexander Ivchenko 2018-05-29 14:27:11 +00:00
parent 9da24fa24d
commit 94fafebe3a
6 changed files with 319 additions and 249 deletions


@@ -32133,6 +32133,27 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
}
// Some mask scalar intrinsics rely on checking whether the lowest mask
// bit is set, and implement it in C code like this:
// A[0] = (U & 1) ? A[0] : W[0];
// This creates some redundant instructions that break pattern matching.
// fold (select (setcc (and X, 1), 0, seteq), Y, Z) -> (select (and X, 1), Z, Y)
if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue AndNode = Cond.getOperand(0);
if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
isNullConstant(Cond.getOperand(1)) &&
isa<ConstantSDNode>(AndNode.getOperand(1)) &&
cast<ConstantSDNode>(AndNode.getOperand(1))->getAPIntValue() == 1) {
// LHS and RHS are swapped because setcc outputs 1 when the AND
// result is 0, and vice versa.
if (AndNode.getValueType() != MVT::i8)
AndNode = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AndNode);
return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
}
}
// v16i8 (select v16i1, v16i8, v16i8) does not have a proper
// lowering on KNL. In this case we convert it to
// v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.


@@ -6697,6 +6697,63 @@ defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86Fnmadds1,
defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86Fnmsubs1,
X86FnmsubRnds1, X86Fnmsubs3, X86FnmsubRnds3>;
multiclass avx512_scalar_fma_patterns<SDNode Op, string Prefix, string Suffix, SDNode Move,
ValueType VT, ValueType EltVT, PatLeaf ZeroFP> {
let Predicates = [HasFMA, HasAVX512] in {
def : Pat<(VT (Move (VT VR128:$src2), (VT (scalar_to_vector
(Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
(EltVT (extractelt (VT VR128:$src2), (iPTR 0))),
(EltVT (extractelt (VT VR128:$src3), (iPTR 0)))))))),
(!cast<I>(Prefix#"213"#Suffix#"Zr_Int")
VR128:$src2, VR128:$src1, VR128:$src3)>;
def : Pat<(VT (Move (VT VR128:$src2), (VT (scalar_to_vector
(X86selects VK1WM:$mask,
(Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
(EltVT (extractelt (VT VR128:$src2), (iPTR 0))),
(EltVT (extractelt (VT VR128:$src3), (iPTR 0)))),
(EltVT (extractelt (VT VR128:$src2), (iPTR 0)))))))),
(!cast<I>(Prefix#"213"#Suffix#"Zr_Intk")
VR128:$src2, VK1WM:$mask, VR128:$src1, VR128:$src3)>;
def : Pat<(VT (Move (VT VR128:$src3), (VT (scalar_to_vector
(X86selects VK1WM:$mask,
(Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
(EltVT (extractelt (VT VR128:$src2), (iPTR 0))),
(EltVT (extractelt (VT VR128:$src3), (iPTR 0)))),
(EltVT (extractelt (VT VR128:$src3), (iPTR 0)))))))),
(!cast<I>(Prefix#"231"#Suffix#"Zr_Intk")
VR128:$src3, VK1WM:$mask, VR128:$src2, VR128:$src1)>;
def : Pat<(VT (Move (VT VR128:$src2), (VT (scalar_to_vector
(X86selects VK1WM:$mask,
(Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
(EltVT (extractelt (VT VR128:$src2), (iPTR 0))),
(EltVT (extractelt (VT VR128:$src3), (iPTR 0)))),
(EltVT ZeroFP)))))),
(!cast<I>(Prefix#"213"#Suffix#"Zr_Intkz")
VR128:$src2, VK1WM:$mask, VR128:$src1, VR128:$src3)>;
}
}
defm : avx512_scalar_fma_patterns<X86Fmadd, "VFMADD", "SS", X86Movss,
v4f32, f32, fp32imm0>;
defm : avx512_scalar_fma_patterns<X86Fmsub, "VFMSUB", "SS", X86Movss,
v4f32, f32, fp32imm0>;
defm : avx512_scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SS", X86Movss,
v4f32, f32, fp32imm0>;
defm : avx512_scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SS", X86Movss,
v4f32, f32, fp32imm0>;
defm : avx512_scalar_fma_patterns<X86Fmadd, "VFMADD", "SD", X86Movsd,
v2f64, f64, fp64imm0>;
defm : avx512_scalar_fma_patterns<X86Fmsub, "VFMSUB", "SD", X86Movsd,
v2f64, f64, fp64imm0>;
defm : avx512_scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SD", X86Movsd,
v2f64, f64, fp64imm0>;
defm : avx512_scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd,
v2f64, f64, fp64imm0>;
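
With these patterns in place, a masked scalar FMA written with the AVX-512
intrinsics selects a single masked vfmadd/vfmsub instead of an unmasked FMA
followed by a masked vmovss/vmovsd (compare the test diffs later in this
commit). A usage sketch, assuming an AVX-512 target:

#include <immintrin.h>

// _mm_mask_fmadd_ss: lane 0 = (k & 1) ? a0*b0 + c0 : a0, upper lanes
// passed through from a.  Matched by the masked 213 pattern above
// (VFMADD213SSZr_Intk).
__m128 fused_masked_fma(__m128 a, __mmask8 k, __m128 b, __m128 c) {
  return _mm_mask_fmadd_ss(a, k, b, c);
}
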
//===----------------------------------------------------------------------===//
// AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
//===----------------------------------------------------------------------===//
@@ -8499,6 +8556,42 @@ defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd",
VEX_W, AVX512AIi8Base, EVEX_4V,
EVEX_CD8<64, CD8VT1>;
multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
dag Mask, X86VectorVTInfo _, PatLeaf ZeroFP,
dag OutMask, Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
(OpNode (extractelt _.VT:$src2, (iPTR 0))),
(extractelt _.VT:$dst, (iPTR 0))))),
(!cast<Instruction>("V"#OpcPrefix#r_Intk)
_.VT:$dst, OutMask, _.VT:$src2, _.VT:$src1)>;
def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
(OpNode (extractelt _.VT:$src2, (iPTR 0))),
ZeroFP))),
(!cast<Instruction>("V"#OpcPrefix#r_Intkz)
OutMask, _.VT:$src2, _.VT:$src1)>;
}
}
multiclass avx512_masked_scalar_imm<SDNode OpNode, string OpcPrefix, SDNode Move,
dag Mask, X86VectorVTInfo _, PatLeaf ZeroFP,
bits<8> ImmV, dag OutMask,
Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
(OpNode (extractelt _.VT:$src2, (iPTR 0))),
(extractelt _.VT:$dst, (iPTR 0))))),
(!cast<Instruction>("V"#OpcPrefix#r_Intk)
_.VT:$dst, OutMask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
(OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))),
(!cast<Instruction>("V"#OpcPrefix#r_Intkz)
OutMask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>;
}
}
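
These helpers are instantiated elsewhere for masked scalar unary ops (the
plain form) and for immediate ops such as rndscale (the _imm form). Assuming
an instantiation for VSQRTSS exists, a masked scalar sqrt can then select the
{k}/{k}{z} forms directly; a usage sketch:

#include <immintrin.h>

// _mm_mask_sqrt_ss: lane 0 = (k & 1) ? sqrt(b0) : src0, upper lanes
// from a.  With the patterns above this becomes one masked vsqrtss.
__m128 masked_sqrt(__m128 src, __mmask8 k, __m128 a, __m128 b) {
  return _mm_mask_sqrt_ss(src, k, a, b);
}
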
//-------------------------------------------------
// Integer truncate and extend operations
//-------------------------------------------------
@@ -10847,69 +10940,54 @@ defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f64_info>
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
multiclass AVX512_scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode MoveNode,
X86VectorVTInfo _, PatLeaf ZeroFP> {
let Predicates = [HasAVX512] in {
// extracted scalar math op with insert via movss
def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector
(Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))),
FR32X:$src))))),
(!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
(COPY_TO_REGCLASS FR32X:$src, VR128X))>;
def : Pat<(_.VT (MoveNode (_.VT VR128X:$dst), (_.VT (scalar_to_vector
(Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
_.FRC:$src))))),
(!cast<I>("V"#OpcPrefix#Zrr_Int) _.VT:$dst,
(COPY_TO_REGCLASS _.FRC:$src, VR128X))>;
// vector math op with insert via movss
def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst),
(Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)))),
(!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
def : Pat<(_.VT (MoveNode (_.VT VR128X:$dst),
(Op (_.VT VR128X:$dst), (_.VT VR128X:$src)))),
(!cast<I>("V"#OpcPrefix#Zrr_Int) _.VT:$dst, _.VT:$src)>;
// extracted masked scalar math op with insert via movss
def : Pat<(X86Movss (v4f32 VR128X:$src1),
def : Pat<(MoveNode (_.VT VR128X:$src1),
(scalar_to_vector
(X86selects VK1WM:$mask,
(Op (f32 (extractelt (v4f32 VR128X:$src1), (iPTR 0))),
FR32X:$src2),
FR32X:$src0))),
(!cast<I>("V"#OpcPrefix#SSZrr_Intk) (COPY_TO_REGCLASS FR32X:$src0, VR128X),
VK1WM:$mask, v4f32:$src1,
(COPY_TO_REGCLASS FR32X:$src2, VR128X))>;
(Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
_.FRC:$src2),
_.FRC:$src0))),
(!cast<I>("V"#OpcPrefix#Zrr_Intk) (COPY_TO_REGCLASS _.FRC:$src0, VR128X),
VK1WM:$mask, _.VT:$src1,
(COPY_TO_REGCLASS _.FRC:$src2, VR128X))>;
// extracted masked scalar math op with insert via movss
def : Pat<(MoveNode (_.VT VR128X:$src1),
(scalar_to_vector
(X86selects VK1WM:$mask,
(Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
_.FRC:$src2), (_.EltVT ZeroFP)))),
(!cast<I>("V"#OpcPrefix#Zrr_Intkz)
VK1WM:$mask, _.VT:$src1,
(COPY_TO_REGCLASS _.FRC:$src2, VR128X))>;
}
}
defm : AVX512_scalar_math_f32_patterns<fadd, "ADD">;
defm : AVX512_scalar_math_f32_patterns<fsub, "SUB">;
defm : AVX512_scalar_math_f32_patterns<fmul, "MUL">;
defm : AVX512_scalar_math_f32_patterns<fdiv, "DIV">;
defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSS", X86Movss, v4f32x_info, fp32imm0>;
defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSS", X86Movss, v4f32x_info, fp32imm0>;
defm : AVX512_scalar_math_fp_patterns<fmul, "MULSS", X86Movss, v4f32x_info, fp32imm0>;
defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSS", X86Movss, v4f32x_info, fp32imm0>;
multiclass AVX512_scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
let Predicates = [HasAVX512] in {
// extracted scalar math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector
(Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))),
FR64X:$src))))),
(!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
(COPY_TO_REGCLASS FR64X:$src, VR128X))>;
defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSD", X86Movsd, v2f64x_info, fp64imm0>;
defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSD", X86Movsd, v2f64x_info, fp64imm0>;
defm : AVX512_scalar_math_fp_patterns<fmul, "MULSD", X86Movsd, v2f64x_info, fp64imm0>;
defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSD", X86Movsd, v2f64x_info, fp64imm0>;
// vector math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst),
(Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)))),
(!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
// extracted masked scalar math op with insert via movss
def : Pat<(X86Movsd (v2f64 VR128X:$src1),
(scalar_to_vector
(X86selects VK1WM:$mask,
(Op (f64 (extractelt (v2f64 VR128X:$src1), (iPTR 0))),
FR64X:$src2),
FR64X:$src0))),
(!cast<I>("V"#OpcPrefix#SDZrr_Intk) (COPY_TO_REGCLASS FR64X:$src0, VR128X),
VK1WM:$mask, v2f64:$src1,
(COPY_TO_REGCLASS FR64X:$src2, VR128X))>;
}
}
defm : AVX512_scalar_math_f64_patterns<fadd, "ADD">;
defm : AVX512_scalar_math_f64_patterns<fsub, "SUB">;
defm : AVX512_scalar_math_f64_patterns<fmul, "MUL">;
defm : AVX512_scalar_math_f64_patterns<fdiv, "DIV">;
//===----------------------------------------------------------------------===//
// AES instructions


@@ -364,6 +364,28 @@ defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadds1, X86Fnmadd,
defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsubs1, X86Fnmsub,
SchedWriteFMA.Scl>, VEX_LIG;
multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix, SDNode Move,
ValueType VT, ValueType EltVT> {
let Predicates = [HasFMA, NoAVX512] in {
def : Pat<(VT (Move (VT VR128:$src2), (VT (scalar_to_vector
(Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))),
(EltVT (extractelt (VT VR128:$src2), (iPTR 0))),
(EltVT (extractelt (VT VR128:$src3), (iPTR 0)))))))),
(!cast<I>(Prefix#"213"#Suffix#"r_Int")
VR128:$src2, VR128:$src1, VR128:$src3)>;
}
}
defm : scalar_fma_patterns<X86Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32>;
defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32>;
defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32>;
defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32>;
defm : scalar_fma_patterns<X86Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64>;
defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64>;
defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64>;
defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64>;
//===----------------------------------------------------------------------===//
// FMA4 - AMD 4 operand Fused Multiply-Add instructions
//===----------------------------------------------------------------------===//


@@ -2680,77 +2680,48 @@ let isCodeGenOnly = 1 in {
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
let Predicates = [UseSSE1] in {
// extracted scalar math op with insert via movss
def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))))),
(!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
(COPY_TO_REGCLASS FR32:$src, VR128))>;
multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
ValueType VT, ValueType EltTy,
RegisterClass RC, Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
// extracted scalar math op with insert via movss/movsd
def : Pat<(VT (Move (VT VR128:$dst), (VT (scalar_to_vector
(Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
RC:$src))))),
(!cast<I>(OpcPrefix#rr_Int) VT:$dst,
(COPY_TO_REGCLASS RC:$src, VR128))>;
// vector math op with insert via movss
def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
(Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
(!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
// vector math op with insert via movss/movsd
def : Pat<(VT (Move (VT VR128:$dst),
(Op (VT VR128:$dst), (VT VR128:$src)))),
(!cast<I>(OpcPrefix#rr_Int) VT:$dst, VT:$src)>;
}
// Repeat everything for AVX.
// Repeat for AVX versions of the instructions.
let Predicates = [UseAVX] in {
// extracted scalar math op with insert via movss
def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
(Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
FR32:$src))))),
(!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
(COPY_TO_REGCLASS FR32:$src, VR128))>;
// extracted scalar math op with insert via movss/movsd
def : Pat<(VT (Move (VT VR128:$dst), (VT (scalar_to_vector
(Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))),
RC:$src))))),
(!cast<I>("V"#OpcPrefix#rr_Int) VT:$dst,
(COPY_TO_REGCLASS RC:$src, VR128))>;
// vector math op with insert via movss
def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
(Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
(!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
// vector math op with insert via movss/movsd
def : Pat<(VT (Move (VT VR128:$dst),
(Op (VT VR128:$dst), (VT VR128:$src)))),
(!cast<I>("V"#OpcPrefix#rr_Int) VT:$dst, VT:$src)>;
}
}
defm : scalar_math_f32_patterns<fadd, "ADD">;
defm : scalar_math_f32_patterns<fsub, "SUB">;
defm : scalar_math_f32_patterns<fmul, "MUL">;
defm : scalar_math_f32_patterns<fdiv, "DIV">;
defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, UseSSE1>;
multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
let Predicates = [UseSSE2] in {
// extracted scalar math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
(Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
FR64:$src))))),
(!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
(COPY_TO_REGCLASS FR64:$src, VR128))>;
// vector math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
(Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
(!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
}
// Repeat everything for AVX.
let Predicates = [UseAVX] in {
// extracted scalar math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
(Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
FR64:$src))))),
(!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
(COPY_TO_REGCLASS FR64:$src, VR128))>;
// vector math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
(Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
(!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
}
}
defm : scalar_math_f64_patterns<fadd, "ADD">;
defm : scalar_math_f64_patterns<fsub, "SUB">;
defm : scalar_math_f64_patterns<fmul, "MUL">;
defm : scalar_math_f64_patterns<fdiv, "DIV">;
defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, UseSSE2>;
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
@@ -2980,11 +2951,40 @@ defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>,
// There is no f64 version of the reciprocal approximation instructions.
// TODO: We should add *scalar* op patterns for these just like we have for
// the binops above. If the binop and unop patterns could all be unified
// that would be even better.
multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
ValueType VT, Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
def : Pat<(VT (Move VT:$dst, (scalar_to_vector
(OpNode (extractelt VT:$src, 0))))),
(!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
}
multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix,
// Repeat for AVX versions of the instructions.
let Predicates = [HasAVX] in {
def : Pat<(VT (Move VT:$dst, (scalar_to_vector
(OpNode (extractelt VT:$src, 0))))),
(!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
}
}
multiclass scalar_unary_math_imm_patterns<SDNode OpNode, string OpcPrefix, SDNode Move,
ValueType VT, bits<8> ImmV,
Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
def : Pat<(VT (Move VT:$dst, (scalar_to_vector
(OpNode (extractelt VT:$src, 0))))),
(!cast<Ii8>(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
}
// Repeat for AVX versions of the instructions.
let Predicates = [HasAVX] in {
def : Pat<(VT (Move VT:$dst, (scalar_to_vector
(OpNode (extractelt VT:$src, 0))))),
(!cast<Ii8>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>;
}
}
multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
SDNode Move, ValueType VT,
Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
@@ -2999,13 +2999,13 @@ multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix,
}
}
defm : scalar_unary_math_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
defm : scalar_unary_math_intr_patterns<int_x86_sse_rcp_ss, "RCPSS", X86Movss,
v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
defm : scalar_unary_math_intr_patterns<int_x86_sse_rsqrt_ss, "RSQRTSS", X86Movss,
v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss,
defm : scalar_unary_math_intr_patterns<int_x86_sse_sqrt_ss, "SQRTSS", X86Movss,
v4f32, UseSSE1>;
defm : scalar_unary_math_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
defm : scalar_unary_math_intr_patterns<int_x86_sse2_sqrt_sd, "SQRTSD", X86Movsd,
v2f64, UseSSE2>;
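
The renamed _intr_ variants keep matching direct intrinsic calls rather than
the extract/insert shape. Usage sketch:

#include <xmmintrin.h>

// _mm_rcp_ss lowers to llvm.x86.sse.rcp.ss and is matched by the
// intrinsic patterns above (RCPSSr_Int / VRCPSSr_Int).
__m128 rcp_low(__m128 a) {
  return _mm_rcp_ss(a);  // lane 0 = ~1/a0, upper lanes from a
}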


@@ -4,12 +4,8 @@
define <4 x float> @select_mask_add_ss(<4 x float> %w, i8 zeroext %u, <4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: select_mask_add_ss:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vaddss %xmm2, %xmm1, %xmm2
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: sete %al
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovss %xmm0, %xmm1, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
entry:
%0 = extractelement <4 x float> %b, i32 0
@@ -26,13 +22,8 @@ entry:
define <4 x float> @select_maskz_add_ss(i8 zeroext %u, <4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: select_maskz_add_ss:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm1
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: sete %al
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
entry:
%0 = extractelement <4 x float> %b, i32 0
@@ -48,12 +39,8 @@ entry:
define <4 x float> @select_mask_sub_ss(<4 x float> %w, i8 zeroext %u, <4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: select_mask_sub_ss:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vsubss %xmm2, %xmm1, %xmm2
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: sete %al
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovss %xmm0, %xmm1, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
entry:
%0 = extractelement <4 x float> %b, i32 0
@@ -70,13 +57,8 @@ entry:
define <4 x float> @select_maskz_sub_ss(i8 zeroext %u, <4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: select_maskz_sub_ss:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm1
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: sete %al
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
entry:
%0 = extractelement <4 x float> %b, i32 0
@@ -92,12 +74,8 @@ entry:
define <4 x float> @select_mask_mul_ss(<4 x float> %w, i8 zeroext %u, <4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: select_mask_mul_ss:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm2
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: sete %al
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vmovss %xmm0, %xmm1, %xmm2 {%k1}
; CHECK-NEXT: vmovaps %xmm2, %xmm0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1}
; CHECK-NEXT: retq
entry:
%0 = extractelement <4 x float> %b, i32 0
@@ -114,13 +92,8 @@ entry:
define <4 x float> @select_maskz_mul_ss(i8 zeroext %u, <4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: select_maskz_mul_ss:
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm1
; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: sete %al
; CHECK-NEXT: kmovw %eax, %k1
; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2
; CHECK-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1}
; CHECK-NEXT: vmovaps %xmm1, %xmm0
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z}
; CHECK-NEXT: retq
entry:
%0 = extractelement <4 x float> %b, i32 0


@@ -4,10 +4,9 @@
define <2 x double> @combine_scalar_mask_fmadd_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fmadd_f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca]
; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1]
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xa9,0xc2]
; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = bitcast <2 x double> %a to <4 x float>
@@ -29,10 +28,9 @@ entry:
define <2 x double> @combine_scalar_mask_fmadd_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fmadd_f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1]
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xc2]
; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = extractelement <2 x double> %a, i64 0
@@ -50,10 +48,9 @@ entry:
define <2 x double> @combine_scalar_maskz_fmadd_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fmadd_32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca]
; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1]
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa9,0xc2]
; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = bitcast <2 x double> %a to <4 x float>
@@ -75,10 +72,9 @@ entry:
define <2 x double> @combine_scalar_maskz_fmadd_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fmadd_64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1]
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa9,0xc2]
; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = extractelement <2 x double> %a, i64 0
@@ -96,10 +92,9 @@ entry:
define <2 x double> @combine_scalar_mask3_fmadd_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fmadd_32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca]
; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xd1]
; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xd1]
; CHECK-NEXT: # xmm2 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
@@ -114,7 +109,7 @@ entry:
%8 = bitcast i8 %k to <8 x i1>
%9 = extractelement <8 x i1> %8, i64 0
%10 = select i1 %9, float %7, float %5
%11 = insertelement <4 x float> %0, float %10, i64 0
%11 = insertelement <4 x float> %2, float %10, i64 0
%12 = bitcast <4 x float> %11 to <2 x double>
ret <2 x double> %12
}
@@ -122,10 +117,9 @@ entry:
define <2 x double> @combine_scalar_mask3_fmadd_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fmadd_64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca]
; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xd1]
; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xd1]
; CHECK-NEXT: # xmm2 = (xmm0 * xmm1) + xmm2
; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
@@ -137,17 +131,16 @@ entry:
%5 = bitcast i8 %k to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, double %4, double %2
%8 = insertelement <2 x double> %a, double %7, i64 0
%8 = insertelement <2 x double> %c, double %7, i64 0
ret <2 x double> %8
}
define <2 x double> @combine_scalar_mask_fmsub_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fmsub_f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca]
; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1]
; CHECK-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xab,0xc2]
; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = bitcast <2 x double> %a to <4 x float>
@@ -169,10 +162,9 @@ entry:
define <2 x double> @combine_scalar_mask_fmsub_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fmsub_f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca]
; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1]
; CHECK-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xab,0xc2]
; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = extractelement <2 x double> %a, i64 0
@@ -190,10 +182,9 @@ entry:
define <2 x double> @combine_scalar_maskz_fmsub_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fmsub_32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca]
; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1]
; CHECK-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xab,0xc2]
; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = bitcast <2 x double> %a to <4 x float>
@@ -215,10 +206,9 @@ entry:
define <2 x double> @combine_scalar_maskz_fmsub_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fmsub_64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca]
; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1]
; CHECK-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xab,0xc2]
; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = extractelement <2 x double> %a, i64 0
@@ -236,10 +226,9 @@ entry:
define <2 x double> @combine_scalar_mask3_fmsub_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fmsub_32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca]
; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xd1]
; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xd1]
; CHECK-NEXT: # xmm2 = (xmm0 * xmm1) - xmm2
; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
@@ -254,7 +243,7 @@ entry:
%8 = bitcast i8 %k to <8 x i1>
%9 = extractelement <8 x i1> %8, i64 0
%10 = select i1 %9, float %7, float %5
%11 = insertelement <4 x float> %0, float %10, i64 0
%11 = insertelement <4 x float> %2, float %10, i64 0
%12 = bitcast <4 x float> %11 to <2 x double>
ret <2 x double> %12
}
@@ -262,10 +251,9 @@ entry:
define <2 x double> @combine_scalar_mask3_fmsub_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fmsub_64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca]
; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xd1]
; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xd1]
; CHECK-NEXT: # xmm2 = (xmm0 * xmm1) - xmm2
; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
@@ -277,17 +265,16 @@ entry:
%5 = bitcast i8 %k to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, double %4, double %2
%8 = insertelement <2 x double> %a, double %7, i64 0
%8 = insertelement <2 x double> %c, double %7, i64 0
ret <2 x double> %8
}
define <2 x double> @combine_scalar_mask_fnmadd_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fnmadd_f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca]
; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1]
; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xad,0xc2]
; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = bitcast <2 x double> %a to <4 x float>
@@ -309,10 +296,9 @@ entry:
define <2 x double> @combine_scalar_mask_fnmadd_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fnmadd_f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca]
; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1]
; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xad,0xc2]
; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = extractelement <2 x double> %a, i64 0
@@ -330,10 +316,9 @@ entry:
define <2 x double> @combine_scalar_maskz_fnmadd_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fnmadd_32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca]
; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1]
; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xad,0xc2]
; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = bitcast <2 x double> %a to <4 x float>
@@ -355,10 +340,9 @@ entry:
define <2 x double> @combine_scalar_maskz_fnmadd_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fnmadd_64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca]
; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1]
; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xad,0xc2]
; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = extractelement <2 x double> %a, i64 0
@@ -376,10 +360,9 @@ entry:
define <2 x double> @combine_scalar_mask3_fnmadd_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fnmadd_32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca]
; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xd1]
; CHECK-NEXT: vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbd,0xd1]
; CHECK-NEXT: # xmm2 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
@@ -394,7 +377,7 @@ entry:
%8 = bitcast i8 %k to <8 x i1>
%9 = extractelement <8 x i1> %8, i64 0
%10 = select i1 %9, float %7, float %5
%11 = insertelement <4 x float> %0, float %10, i64 0
%11 = insertelement <4 x float> %2, float %10, i64 0
%12 = bitcast <4 x float> %11 to <2 x double>
ret <2 x double> %12
}
@@ -402,10 +385,9 @@ entry:
define <2 x double> @combine_scalar_mask3_fnmadd_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fnmadd_64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca]
; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xd1]
; CHECK-NEXT: vfnmadd231sd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbd,0xd1]
; CHECK-NEXT: # xmm2 = -(xmm0 * xmm1) + xmm2
; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
@@ -417,17 +399,16 @@ entry:
%5 = bitcast i8 %k to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, double %4, double %2
%8 = insertelement <2 x double> %a, double %7, i64 0
%8 = insertelement <2 x double> %c, double %7, i64 0
ret <2 x double> %8
}
define <2 x double> @combine_scalar_mask_fnmsub_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fnmsub_f32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca]
; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1]
; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xaf,0xc2]
; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = bitcast <2 x double> %a to <4 x float>
@@ -450,10 +431,9 @@ entry:
define <2 x double> @combine_scalar_mask_fnmsub_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_mask_fnmsub_f64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1]
; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xaf,0xc2]
; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = extractelement <2 x double> %a, i64 0
@@ -472,10 +452,9 @@ entry:
define <2 x double> @combine_scalar_maskz_fnmsub_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fnmsub_32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca]
; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1]
; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xaf,0xc2]
; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = bitcast <2 x double> %a to <4 x float>
@@ -498,10 +477,9 @@ entry:
define <2 x double> @combine_scalar_maskz_fnmsub_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) {
; CHECK-LABEL: combine_scalar_maskz_fnmsub_64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1]
; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xaf,0xc2]
; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
%0 = extractelement <2 x double> %a, i64 0
@@ -520,10 +498,9 @@ entry:
define <2 x double> @combine_scalar_mask3_fnmsub_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fnmsub_32:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca]
; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xd1]
; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xd1]
; CHECK-NEXT: # xmm2 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
@@ -539,7 +516,7 @@ entry:
%8 = bitcast i8 %k to <8 x i1>
%9 = extractelement <8 x i1> %8, i64 0
%10 = select i1 %9, float %7, float %5
%11 = insertelement <4 x float> %0, float %10, i64 0
%11 = insertelement <4 x float> %2, float %10, i64 0
%12 = bitcast <4 x float> %11 to <2 x double>
ret <2 x double> %12
}
@@ -547,10 +524,9 @@ entry:
define <2 x double> @combine_scalar_mask3_fnmsub_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) {
; CHECK-LABEL: combine_scalar_mask3_fnmsub_64:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca]
; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xd1]
; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xd1]
; CHECK-NEXT: # xmm2 = -(xmm0 * xmm1) - xmm2
; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
; CHECK-NEXT: retq # encoding: [0xc3]
entry:
@@ -563,6 +539,6 @@ entry:
%5 = bitcast i8 %k to <8 x i1>
%6 = extractelement <8 x i1> %5, i64 0
%7 = select i1 %6, double %4, double %2
%8 = insertelement <2 x double> %a, double %7, i64 0
%8 = insertelement <2 x double> %c, double %7, i64 0
ret <2 x double> %8
}