From 94fafebe3a2a622d247ca4e31d5be5171570ec3b Mon Sep 17 00:00:00 2001
From: Alexander Ivchenko
Date: Tue, 29 May 2018 14:27:11 +0000
Subject: [PATCH] [X86] Scalar mask and scalar move optimizations

1. Introduction of mask scalar TableGen patterns.
2. Introduction of new scalar move TableGen patterns and refactoring of
   existing ones.
3. Folding of the pattern created by introducing scalar masking in the
   Clang header files.

Patch by tkrupa

Differential Revision: https://reviews.llvm.org/D47012

llvm-svn: 333419
---
 lib/Target/X86/X86ISelLowering.cpp     |  21 +++
 lib/Target/X86/X86InstrAVX512.td       | 178 ++++++++++++++++++-------
 lib/Target/X86/X86InstrFMA.td          |  22 +++
 lib/Target/X86/X86InstrSSE.td          | 160 +++++++++++-----------
 test/CodeGen/X86/combine-select.ll     |  51 ++-----
 test/CodeGen/X86/fma-scalar-combine.ll | 136 ++++++++-----------
 6 files changed, 319 insertions(+), 249 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 38649b8fdf0..c106b9b53db 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -32133,6 +32133,27 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
     return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
   }
 
+  // Some mask scalar intrinsics rely on checking if only one bit is set
+  // and implement it in C code like this:
+  //   A[0] = (U & 1) ? A[0] : W[0];
+  // This creates some redundant instructions that break pattern matching.
+  // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1), Z, Y)
+  if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT &&
+      Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) {
+    ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+    SDValue AndNode = Cond.getOperand(0);
+    if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ &&
+        isNullConstant(Cond.getOperand(1)) &&
+        isa<ConstantSDNode>(AndNode.getOperand(1)) &&
+        cast<ConstantSDNode>(AndNode.getOperand(1))->getAPIntValue() == 1) {
+      // LHS and RHS swapped due to
+      // setcc outputting 1 when AND resulted in 0 and vice versa.
+      if (AndNode.getValueType() != MVT::i8)
+        AndNode = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AndNode);
+      return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS);
+    }
+  }
+
   // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
   // lowering on KNL. In this case we convert it to
   // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
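[Note] The combine above targets the shape that the masked scalar intrinsics
produce once the Clang headers implement scalar masking with an explicit
"(U & 1) ? ... : ..." merge. A minimal C++ sketch of that shape (a hypothetical
helper, not the literal avx512fintrin.h body; the R[0] lane access relies on
the Clang/GCC vector extension):

    #include <immintrin.h>

    // Roughly what a merge-masked scalar add looks like after the header
    // change (sketch only).
    __m128 mask_add_ss_sketch(__m128 W, unsigned char U, __m128 A, __m128 B) {
      __m128 R = _mm_add_ss(A, B);    // low-lane add, upper lanes taken from A
      R[0] = (U & 1) ? R[0] : W[0];   // merge under bit 0 of the mask
      return R;
    }

The ?: reaches the DAG as select(setcc(and(U, 1), 0, seteq), W[0], R[0]); the
combine rewrites it to select(and(U, 1), R[0], W[0]) so that the masked
instruction patterns added below can match it, yielding a single
"vaddss ... {%k1}" (see test/CodeGen/X86/combine-select.ll).
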
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 3bf097bd79c..2b65106518c 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -6697,6 +6697,63 @@ defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86Fnmadds1, defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86Fnmsubs1, X86FnmsubRnds1, X86Fnmsubs3, X86FnmsubRnds3>; +multiclass avx512_scalar_fma_patterns { + let Predicates = [HasFMA, HasAVX512] in { + def : Pat<(VT (Move (VT VR128:$src2), (VT (scalar_to_vector + (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))), + (EltVT (extractelt (VT VR128:$src2), (iPTR 0))), + (EltVT (extractelt (VT VR128:$src3), (iPTR 0)))))))), + (!cast(Prefix#"213"#Suffix#"Zr_Int") + VR128:$src2, VR128:$src1, VR128:$src3)>; + + def : Pat<(VT (Move (VT VR128:$src2), (VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))), + (EltVT (extractelt (VT VR128:$src2), (iPTR 0))), + (EltVT (extractelt (VT VR128:$src3), (iPTR 0)))), + (EltVT (extractelt (VT VR128:$src2), (iPTR 0)))))))), + (!cast(Prefix#"213"#Suffix#"Zr_Intk") + VR128:$src2, VK1WM:$mask, VR128:$src1, VR128:$src3)>; + + def : Pat<(VT (Move (VT VR128:$src3), (VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))), + (EltVT (extractelt (VT VR128:$src2), (iPTR 0))), + (EltVT (extractelt (VT VR128:$src3), (iPTR 0)))), + (EltVT (extractelt (VT VR128:$src3), (iPTR 0)))))))), + (!cast(Prefix#"231"#Suffix#"Zr_Intk") + VR128:$src3, VK1WM:$mask, VR128:$src2, VR128:$src1)>; + + def : Pat<(VT (Move (VT VR128:$src2), (VT (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))), + (EltVT (extractelt (VT VR128:$src2), (iPTR 0))), + (EltVT (extractelt (VT VR128:$src3), (iPTR 0)))), + (EltVT ZeroFP)))))), + (!cast(Prefix#"213"#Suffix#"Zr_Intkz") + VR128:$src2, VK1WM:$mask, VR128:$src1, VR128:$src3)>; + } +} + +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; + +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; +defm : avx512_scalar_fma_patterns; + //===----------------------------------------------------------------------===// // AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA //===----------------------------------------------------------------------===// @@ -8499,6 +8556,42 @@ defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", VEX_W, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>; +multiclass avx512_masked_scalar { + let Predicates = [BasePredicate] in { + def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask, + (OpNode (extractelt _.VT:$src2, (iPTR 0))), + (extractelt _.VT:$dst, (iPTR 0))))), + (!cast("V"#OpcPrefix#r_Intk) + _.VT:$dst, OutMask, _.VT:$src2, _.VT:$src1)>; + + def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask, + (OpNode (extractelt _.VT:$src2, (iPTR 0))), + ZeroFP))), + (!cast("V"#OpcPrefix#r_Intkz) + OutMask, _.VT:$src2, _.VT:$src1)>; + } +} + +multiclass avx512_masked_scalar_imm ImmV, dag OutMask, + Predicate BasePredicate> { + let Predicates = [BasePredicate] in { + def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask, + (OpNode (extractelt _.VT:$src2, (iPTR 0))), + (extractelt _.VT:$dst, (iPTR 0))))), + (!cast("V"#OpcPrefix#r_Intk) + _.VT:$dst, OutMask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>; + 
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask, + (OpNode (extractelt _.VT:$src2, (iPTR 0))), ZeroFP))), + (!cast("V"#OpcPrefix#r_Intkz) + OutMask, _.VT:$src1, _.VT:$src2, (i32 ImmV))>; + } +} + //------------------------------------------------- // Integer truncate and extend operations //------------------------------------------------- @@ -10847,69 +10940,54 @@ defm VFIXUPIMMPD : avx512_fixupimm_packed_all // TODO: Some canonicalization in lowering would simplify the number of // patterns we have to try to match. -multiclass AVX512_scalar_math_f32_patterns { +multiclass AVX512_scalar_math_fp_patterns { let Predicates = [HasAVX512] in { // extracted scalar math op with insert via movss - def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector - (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))), - FR32X:$src))))), - (!cast("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, - (COPY_TO_REGCLASS FR32X:$src, VR128X))>; + def : Pat<(_.VT (MoveNode (_.VT VR128X:$dst), (_.VT (scalar_to_vector + (Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))), + _.FRC:$src))))), + (!cast("V"#OpcPrefix#Zrr_Int) _.VT:$dst, + (COPY_TO_REGCLASS _.FRC:$src, VR128X))>; // vector math op with insert via movss - def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), - (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)))), - (!cast("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>; + def : Pat<(_.VT (MoveNode (_.VT VR128X:$dst), + (Op (_.VT VR128X:$dst), (_.VT VR128X:$src)))), + (!cast("V"#OpcPrefix#Zrr_Int) _.VT:$dst, _.VT:$src)>; // extracted masked scalar math op with insert via movss - def : Pat<(X86Movss (v4f32 VR128X:$src1), + def : Pat<(MoveNode (_.VT VR128X:$src1), (scalar_to_vector (X86selects VK1WM:$mask, - (Op (f32 (extractelt (v4f32 VR128X:$src1), (iPTR 0))), - FR32X:$src2), - FR32X:$src0))), - (!cast("V"#OpcPrefix#SSZrr_Intk) (COPY_TO_REGCLASS FR32X:$src0, VR128X), - VK1WM:$mask, v4f32:$src1, - (COPY_TO_REGCLASS FR32X:$src2, VR128X))>; + (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src2), + _.FRC:$src0))), + (!cast("V"#OpcPrefix#Zrr_Intk) (COPY_TO_REGCLASS _.FRC:$src0, VR128X), + VK1WM:$mask, _.VT:$src1, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X))>; + + // extracted masked scalar math op with insert via movss + def : Pat<(MoveNode (_.VT VR128X:$src1), + (scalar_to_vector + (X86selects VK1WM:$mask, + (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))), + _.FRC:$src2), (_.EltVT ZeroFP)))), + (!cast("V"#OpcPrefix#Zrr_Intkz) + VK1WM:$mask, _.VT:$src1, + (COPY_TO_REGCLASS _.FRC:$src2, VR128X))>; } } -defm : AVX512_scalar_math_f32_patterns; -defm : AVX512_scalar_math_f32_patterns; -defm : AVX512_scalar_math_f32_patterns; -defm : AVX512_scalar_math_f32_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; -multiclass AVX512_scalar_math_f64_patterns { - let Predicates = [HasAVX512] in { - // extracted scalar math op with insert via movsd - def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector - (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))), - FR64X:$src))))), - (!cast("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, - (COPY_TO_REGCLASS FR64X:$src, VR128X))>; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; +defm : AVX512_scalar_math_fp_patterns; - // vector math op with insert via movsd - def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), - (Op (v2f64 VR128X:$dst), (v2f64 
VR128X:$src)))), - (!cast("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>; - - // extracted masked scalar math op with insert via movss - def : Pat<(X86Movsd (v2f64 VR128X:$src1), - (scalar_to_vector - (X86selects VK1WM:$mask, - (Op (f64 (extractelt (v2f64 VR128X:$src1), (iPTR 0))), - FR64X:$src2), - FR64X:$src0))), - (!cast("V"#OpcPrefix#SDZrr_Intk) (COPY_TO_REGCLASS FR64X:$src0, VR128X), - VK1WM:$mask, v2f64:$src1, - (COPY_TO_REGCLASS FR64X:$src2, VR128X))>; - } -} - -defm : AVX512_scalar_math_f64_patterns; -defm : AVX512_scalar_math_f64_patterns; -defm : AVX512_scalar_math_f64_patterns; -defm : AVX512_scalar_math_f64_patterns; //===----------------------------------------------------------------------===// // AES instructions diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index c979f6cbc56..c106b825f57 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -364,6 +364,28 @@ defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadds1, X86Fnmadd, defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsubs1, X86Fnmsub, SchedWriteFMA.Scl>, VEX_LIG; +multiclass scalar_fma_patterns { + let Predicates = [HasFMA, NoAVX512] in { + def : Pat<(VT (Move (VT VR128:$src2), (VT (scalar_to_vector + (Op (EltVT (extractelt (VT VR128:$src1), (iPTR 0))), + (EltVT (extractelt (VT VR128:$src2), (iPTR 0))), + (EltVT (extractelt (VT VR128:$src3), (iPTR 0)))))))), + (!cast(Prefix#"213"#Suffix#"r_Int") + VR128:$src2, VR128:$src1, VR128:$src3)>; + } +} + +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; + +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; +defm : scalar_fma_patterns; + //===----------------------------------------------------------------------===// // FMA4 - AMD 4 operand Fused Multiply-Add instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index eecf155153d..b392f016691 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -2680,78 +2680,49 @@ let isCodeGenOnly = 1 in { // TODO: Some canonicalization in lowering would simplify the number of // patterns we have to try to match. -multiclass scalar_math_f32_patterns { - let Predicates = [UseSSE1] in { - // extracted scalar math op with insert via movss - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (!cast(OpcPrefix#SSrr_Int) v4f32:$dst, - (COPY_TO_REGCLASS FR32:$src, VR128))>; +multiclass scalar_math_patterns { + let Predicates = [BasePredicate] in { + // extracted scalar math op with insert via movss/movsd + def : Pat<(VT (Move (VT VR128:$dst), (VT (scalar_to_vector + (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), + RC:$src))))), + (!cast(OpcPrefix#rr_Int) VT:$dst, + (COPY_TO_REGCLASS RC:$src, VR128))>; - // vector math op with insert via movss - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (!cast(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; - } + // vector math op with insert via movss/movsd + def : Pat<(VT (Move (VT VR128:$dst), + (Op (VT VR128:$dst), (VT VR128:$src)))), + (!cast(OpcPrefix#rr_Int) VT:$dst, VT:$src)>; + } - // Repeat everything for AVX. 
- let Predicates = [UseAVX] in { - // extracted scalar math op with insert via movss - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))))), - (!cast("V"#OpcPrefix#SSrr_Int) v4f32:$dst, - (COPY_TO_REGCLASS FR32:$src, VR128))>; + // Repeat for AVX versions of the instructions. + let Predicates = [UseAVX] in { + // extracted scalar math op with insert via movss/movsd + def : Pat<(VT (Move (VT VR128:$dst), (VT (scalar_to_vector + (Op (EltTy (extractelt (VT VR128:$dst), (iPTR 0))), + RC:$src))))), + (!cast("V"#OpcPrefix#rr_Int) VT:$dst, + (COPY_TO_REGCLASS RC:$src, VR128))>; - // vector math op with insert via movss - def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), - (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), - (!cast("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; - } + // vector math op with insert via movss/movsd + def : Pat<(VT (Move (VT VR128:$dst), + (Op (VT VR128:$dst), (VT VR128:$src)))), + (!cast("V"#OpcPrefix#rr_Int) VT:$dst, VT:$src)>; + } } -defm : scalar_math_f32_patterns; -defm : scalar_math_f32_patterns; -defm : scalar_math_f32_patterns; -defm : scalar_math_f32_patterns; - -multiclass scalar_math_f64_patterns { - let Predicates = [UseSSE2] in { - // extracted scalar math op with insert via movsd - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (!cast(OpcPrefix#SDrr_Int) v2f64:$dst, - (COPY_TO_REGCLASS FR64:$src, VR128))>; - - // vector math op with insert via movsd - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (!cast(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; - } - - // Repeat everything for AVX. - let Predicates = [UseAVX] in { - // extracted scalar math op with insert via movsd - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))))), - (!cast("V"#OpcPrefix#SDrr_Int) v2f64:$dst, - (COPY_TO_REGCLASS FR64:$src, VR128))>; - - // vector math op with insert via movsd - def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), - (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), - (!cast("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; - } -} - -defm : scalar_math_f64_patterns; -defm : scalar_math_f64_patterns; -defm : scalar_math_f64_patterns; -defm : scalar_math_f64_patterns; +defm : scalar_math_patterns; +defm : scalar_math_patterns; +defm : scalar_math_patterns; +defm : scalar_math_patterns; +defm : scalar_math_patterns; +defm : scalar_math_patterns; +defm : scalar_math_patterns; +defm : scalar_math_patterns; + /// Unop Arithmetic /// In addition, we also have a special variant of the scalar form here to /// represent the associated intrinsic operation. This form is unlike the @@ -2980,13 +2951,42 @@ defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SchedWriteFRcp, HasAVX>, // There is no f64 version of the reciprocal approximation instructions. -// TODO: We should add *scalar* op patterns for these just like we have for -// the binops above. If the binop and unop patterns could all be unified -// that would be even better. +multiclass scalar_unary_math_patterns { + let Predicates = [BasePredicate] in { + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, 0))))), + (!cast(OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } -multiclass scalar_unary_math_patterns { + // Repeat for AVX versions of the instructions. 
+ let Predicates = [HasAVX] in { + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, 0))))), + (!cast("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; + } +} + +multiclass scalar_unary_math_imm_patterns ImmV, + Predicate BasePredicate> { + let Predicates = [BasePredicate] in { + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, 0))))), + (!cast(OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>; + } + + // Repeat for AVX versions of the instructions. + let Predicates = [HasAVX] in { + def : Pat<(VT (Move VT:$dst, (scalar_to_vector + (OpNode (extractelt VT:$src, 0))))), + (!cast("V"#OpcPrefix#r_Int) VT:$dst, VT:$src, (i32 ImmV))>; + } +} + +multiclass scalar_unary_math_intr_patterns { let Predicates = [BasePredicate] in { def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), (!cast(OpcPrefix#r_Int) VT:$dst, VT:$src)>; @@ -2999,14 +2999,14 @@ multiclass scalar_unary_math_patterns; -defm : scalar_unary_math_patterns; -defm : scalar_unary_math_patterns; -defm : scalar_unary_math_patterns; +defm : scalar_unary_math_intr_patterns; +defm : scalar_unary_math_intr_patterns; +defm : scalar_unary_math_intr_patterns; +defm : scalar_unary_math_intr_patterns; //===----------------------------------------------------------------------===// diff --git a/test/CodeGen/X86/combine-select.ll b/test/CodeGen/X86/combine-select.ll index 92f7a16b7c7..58be308f505 100644 --- a/test/CodeGen/X86/combine-select.ll +++ b/test/CodeGen/X86/combine-select.ll @@ -4,12 +4,8 @@ define <4 x float> @select_mask_add_ss(<4 x float> %w, i8 zeroext %u, <4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: select_mask_add_ss: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vaddss %xmm2, %xmm1, %xmm2 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovss %xmm0, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vaddss %xmm2, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq entry: %0 = extractelement <4 x float> %b, i32 0 @@ -26,13 +22,8 @@ entry: define <4 x float> @select_maskz_add_ss(i8 zeroext %u, <4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: select_maskz_add_ss: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm1 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vaddss %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq entry: %0 = extractelement <4 x float> %b, i32 0 @@ -48,12 +39,8 @@ entry: define <4 x float> @select_mask_sub_ss(<4 x float> %w, i8 zeroext %u, <4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: select_mask_sub_ss: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vsubss %xmm2, %xmm1, %xmm2 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovss %xmm0, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vsubss %xmm2, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq entry: %0 = extractelement <4 x float> %b, i32 0 @@ -70,13 +57,8 @@ entry: define <4 x float> @select_maskz_sub_ss(i8 zeroext %u, <4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: select_maskz_sub_ss: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm1 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vmovss %xmm2, %xmm0, 
%xmm1 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vsubss %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq entry: %0 = extractelement <4 x float> %b, i32 0 @@ -92,12 +74,8 @@ entry: define <4 x float> @select_mask_mul_ss(<4 x float> %w, i8 zeroext %u, <4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: select_mask_mul_ss: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm2 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vmovss %xmm0, %xmm1, %xmm2 {%k1} -; CHECK-NEXT: vmovaps %xmm2, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmulss %xmm2, %xmm1, %xmm0 {%k1} ; CHECK-NEXT: retq entry: %0 = extractelement <4 x float> %b, i32 0 @@ -114,13 +92,8 @@ entry: define <4 x float> @select_maskz_mul_ss(i8 zeroext %u, <4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: select_maskz_mul_ss: ; CHECK: ## %bb.0: ## %entry -; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm1 -; CHECK-NEXT: testb $1, %dil -; CHECK-NEXT: sete %al -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 -; CHECK-NEXT: vmovss %xmm2, %xmm0, %xmm1 {%k1} -; CHECK-NEXT: vmovaps %xmm1, %xmm0 +; CHECK-NEXT: kmovw %edi, %k1 +; CHECK-NEXT: vmulss %xmm1, %xmm0, %xmm0 {%k1} {z} ; CHECK-NEXT: retq entry: %0 = extractelement <4 x float> %b, i32 0 diff --git a/test/CodeGen/X86/fma-scalar-combine.ll b/test/CodeGen/X86/fma-scalar-combine.ll index 97c812d9013..188408fb7ae 100644 --- a/test/CodeGen/X86/fma-scalar-combine.ll +++ b/test/CodeGen/X86/fma-scalar-combine.ll @@ -4,10 +4,9 @@ define <2 x double> @combine_scalar_mask_fmadd_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fmadd_f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1] +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xa9,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -29,10 +28,9 @@ entry: define <2 x double> @combine_scalar_mask_fmadd_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fmadd_f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1] +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xa9,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -50,10 +48,9 @@ entry: define <2 x double> @combine_scalar_maskz_fmadd_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fmadd_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} 
{z} # encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1] +; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xa9,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -75,10 +72,9 @@ entry: define <2 x double> @combine_scalar_maskz_fmadd_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fmadd_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1] +; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xa9,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -96,10 +92,9 @@ entry: define <2 x double> @combine_scalar_mask3_fmadd_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fmadd_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xd1] +; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xb9,0xd1] +; CHECK-NEXT: # xmm2 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -114,7 +109,7 @@ entry: %8 = bitcast i8 %k to <8 x i1> %9 = extractelement <8 x i1> %8, i64 0 %10 = select i1 %9, float %7, float %5 - %11 = insertelement <4 x float> %0, float %10, i64 0 + %11 = insertelement <4 x float> %2, float %10, i64 0 %12 = bitcast <4 x float> %11 to <2 x double> ret <2 x double> %12 } @@ -122,10 +117,9 @@ entry: define <2 x double> @combine_scalar_mask3_fmadd_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fmadd_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xd1] +; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xb9,0xd1] +; CHECK-NEXT: # xmm2 = (xmm0 * xmm1) + xmm2 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -137,17 +131,16 @@ entry: %5 = bitcast i8 %k to <8 x i1> %6 = extractelement <8 x i1> %5, i64 0 %7 = select i1 %6, double %4, double %2 - %8 = insertelement <2 x double> %a, double %7, i64 0 + %8 = insertelement <2 x double> %c, double %7, i64 0 ret <2 x double> %8 } define <2 x double> @combine_scalar_mask_fmsub_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fmsub_f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 
; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1] +; CHECK-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xab,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -169,10 +162,9 @@ entry: define <2 x double> @combine_scalar_mask_fmsub_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fmsub_f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1] +; CHECK-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xab,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -190,10 +182,9 @@ entry: define <2 x double> @combine_scalar_maskz_fmsub_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fmsub_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1] +; CHECK-NEXT: vfmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xab,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -215,10 +206,9 @@ entry: define <2 x double> @combine_scalar_maskz_fmsub_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fmsub_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1] +; CHECK-NEXT: vfmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xab,0xc2] +; CHECK-NEXT: # xmm0 = (xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -236,10 +226,9 @@ entry: define <2 x double> @combine_scalar_mask3_fmsub_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fmsub_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xd1] +; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbb,0xd1] +; CHECK-NEXT: # xmm2 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -254,7 +243,7 @@ entry: %8 = bitcast i8 %k to <8 x i1> %9 = extractelement <8 
x i1> %8, i64 0 %10 = select i1 %9, float %7, float %5 - %11 = insertelement <4 x float> %0, float %10, i64 0 + %11 = insertelement <4 x float> %2, float %10, i64 0 %12 = bitcast <4 x float> %11 to <2 x double> ret <2 x double> %12 } @@ -262,10 +251,9 @@ entry: define <2 x double> @combine_scalar_mask3_fmsub_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fmsub_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0xca] -; CHECK-NEXT: # xmm1 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xd1] +; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbb,0xd1] +; CHECK-NEXT: # xmm2 = (xmm0 * xmm1) - xmm2 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -277,17 +265,16 @@ entry: %5 = bitcast i8 %k to <8 x i1> %6 = extractelement <8 x i1> %5, i64 0 %7 = select i1 %6, double %4, double %2 - %8 = insertelement <2 x double> %a, double %7, i64 0 + %8 = insertelement <2 x double> %c, double %7, i64 0 ret <2 x double> %8 } define <2 x double> @combine_scalar_mask_fnmadd_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fnmadd_f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1] +; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0x75,0x09,0xad,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -309,10 +296,9 @@ entry: define <2 x double> @combine_scalar_mask_fnmadd_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fnmadd_f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1] +; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xad,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -330,10 +316,9 @@ entry: define <2 x double> @combine_scalar_maskz_fnmadd_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fnmadd_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1] +; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xad,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x 
double> %a to <4 x float> @@ -355,10 +340,9 @@ entry: define <2 x double> @combine_scalar_maskz_fnmadd_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fnmadd_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1] +; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xad,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) + xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -376,10 +360,9 @@ entry: define <2 x double> @combine_scalar_mask3_fnmadd_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fnmadd_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmadd213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xd1] +; CHECK-NEXT: vfnmadd231ss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbd,0xd1] +; CHECK-NEXT: # xmm2 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -394,7 +377,7 @@ entry: %8 = bitcast i8 %k to <8 x i1> %9 = extractelement <8 x i1> %8, i64 0 %10 = select i1 %9, float %7, float %5 - %11 = insertelement <4 x float> %0, float %10, i64 0 + %11 = insertelement <4 x float> %2, float %10, i64 0 %12 = bitcast <4 x float> %11 to <2 x double> ret <2 x double> %12 } @@ -402,10 +385,9 @@ entry: define <2 x double> @combine_scalar_mask3_fnmadd_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fnmadd_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmadd213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xd1] +; CHECK-NEXT: vfnmadd231sd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbd,0xd1] +; CHECK-NEXT: # xmm2 = -(xmm0 * xmm1) + xmm2 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -417,17 +399,16 @@ entry: %5 = bitcast i8 %k to <8 x i1> %6 = extractelement <8 x i1> %5, i64 0 %7 = select i1 %6, double %4, double %2 - %8 = insertelement <2 x double> %a, double %7, i64 0 + %8 = insertelement <2 x double> %c, double %7, i64 0 ret <2 x double> %8 } define <2 x double> @combine_scalar_mask_fnmsub_f32(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fnmsub_f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xc1] +; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} # encoding: 
[0x62,0xf2,0x75,0x09,0xaf,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -450,10 +431,9 @@ entry: define <2 x double> @combine_scalar_mask_fnmsub_f64(<2 x double> %a, i8 zeroext %k, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_mask_fnmsub_f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xc1] +; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x09,0xaf,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -472,10 +452,9 @@ entry: define <2 x double> @combine_scalar_maskz_fnmsub_32(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fnmsub_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7e,0x89,0x10,0xc1] +; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x75,0x89,0xaf,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = bitcast <2 x double> %a to <4 x float> @@ -498,10 +477,9 @@ entry: define <2 x double> @combine_scalar_maskz_fnmsub_64(i8 zeroext %k, <2 x double> %a, <2 x double> %b, <2 x double> %c) { ; CHECK-LABEL: combine_scalar_maskz_fnmsub_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xff,0x89,0x10,0xc1] +; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0x89,0xaf,0xc2] +; CHECK-NEXT: # xmm0 = -(xmm1 * xmm0) - xmm2 ; CHECK-NEXT: retq # encoding: [0xc3] entry: %0 = extractelement <2 x double> %a, i64 0 @@ -520,10 +498,9 @@ entry: define <2 x double> @combine_scalar_mask3_fnmsub_32(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fnmsub_32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213ss %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7e,0x09,0x10,0xd1] +; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x09,0xbf,0xd1] +; CHECK-NEXT: # xmm2 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -539,7 +516,7 @@ entry: %8 = bitcast i8 %k to <8 x i1> %9 = extractelement <8 x i1> %8, i64 0 %10 = select i1 %9, float %7, float %5 - %11 = insertelement <4 x float> %0, float %10, i64 0 + %11 = insertelement <4 x float> %2, float %10, i64 0 %12 = bitcast <4 x float> %11 
to <2 x double> ret <2 x double> %12 } @@ -547,10 +524,9 @@ entry: define <2 x double> @combine_scalar_mask3_fnmsub_64(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 zeroext %k) { ; CHECK-LABEL: combine_scalar_mask3_fnmsub_64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: vfnmsub213sd %xmm2, %xmm0, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0xca] -; CHECK-NEXT: # xmm1 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf] -; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0xff,0x09,0x10,0xd1] +; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0xbf,0xd1] +; CHECK-NEXT: # xmm2 = -(xmm0 * xmm1) - xmm2 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2] ; CHECK-NEXT: retq # encoding: [0xc3] entry: @@ -563,6 +539,6 @@ entry: %5 = bitcast i8 %k to <8 x i1> %6 = extractelement <8 x i1> %5, i64 0 %7 = select i1 %6, double %4, double %2 - %8 = insertelement <2 x double> %a, double %7, i64 0 + %8 = insertelement <2 x double> %c, double %7, i64 0 ret <2 x double> %8 }
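
[Note] For reference, a short C++ sketch of intrinsic calls that should
exercise the new masked scalar patterns end to end (an assumed test driver,
not part of this patch, and it presumes the matching Clang header changes that
introduce scalar masking). Built with something like "clang -O2 -mavx512f",
each function is expected to compile to the single masked instruction checked
in the tests above:

    #include <immintrin.h>

    // Merge-masked scalar add: expect "vaddss ... {%k1}" (combine-select.ll).
    __m128 mask_add_ss(__m128 w, __mmask8 u, __m128 a, __m128 b) {
      return _mm_mask_add_ss(w, u, a, b);
    }

    // Zero-masked scalar mul: expect "vmulss ... {%k1} {z}" (combine-select.ll).
    __m128 maskz_mul_ss(__mmask8 u, __m128 a, __m128 b) {
      return _mm_maskz_mul_ss(u, a, b);
    }

    // Merge-masked scalar FMA: expect "vfmadd213ss ... {%k1}"
    // (fma-scalar-combine.ll).
    __m128 mask_fmadd_ss(__m128 a, __mmask8 k, __m128 b, __m128 c) {
      return _mm_mask_fmadd_ss(a, k, b, c);
    }

    // mask3 form: expect "vfmadd231ss ... {%k1}" followed by a vmovaps of the
    // accumulator (fma-scalar-combine.ll).
    __m128 mask3_fmadd_ss(__m128 a, __m128 b, __m128 c, __mmask8 k) {
      return _mm_mask3_fmadd_ss(a, b, c, k);
    }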