
[X86] Add combination for fma and fneg on X86 under strict FP.

Summary: X86 has instructions that compute fma and fneg at the same time. Under strict FP, we combine the fneg and fma only when the fneg is a source operand.

Reviewers: craig.topper, andrew.w.kaylor, uweigand, RKSimon, LiuChen3

Subscribers: LuoYuanke, llvm-commits, cfe-commits, jdoerfert, hiraditya

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D72824
Wang, Pengfei 2020-01-15 19:08:38 +08:00
parent b93b2eeaa3
commit 335158c8c4
6 changed files with 1010 additions and 80 deletions
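Illustrative sketch (not part of the commit; the function and value names are made up): the pattern this combine now handles under strict FP is an fneg feeding a source operand of the constrained fma intrinsic, as in test f1 below. With -mattr=+fma it can now be selected as a single vfnmadd213ss instead of an explicit xorps sign flip followed by the fma.

define float @fneg_into_strict_fma(float %a, float %b, float %c) #0 {
entry:
  ; Negate a multiplicand, then perform a constrained (strict-FP) fma.
  %na = fneg float %a
  ; After this patch the DAG combine folds the fneg into an X86ISD::STRICT_FNMADD node.
  %r = call float @llvm.experimental.constrained.fma.f32(float %na, float %b, float %c,
                                                         metadata !"round.dynamic",
                                                         metadata !"fpexcept.strict") #0
  ret float %r
}

declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
attributes #0 = { strictfp }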


@@ -2000,6 +2000,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FNEG);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::STRICT_FMA);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SUB);
@@ -29850,8 +29851,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  NODE_NAME_CASE(VPCOMU)
  NODE_NAME_CASE(VPERMIL2)
  NODE_NAME_CASE(FMSUB)
  NODE_NAME_CASE(STRICT_FMSUB)
  NODE_NAME_CASE(FNMADD)
  NODE_NAME_CASE(STRICT_FNMADD)
  NODE_NAME_CASE(FNMSUB)
  NODE_NAME_CASE(STRICT_FNMSUB)
  NODE_NAME_CASE(FMADDSUB)
  NODE_NAME_CASE(FMSUBADD)
  NODE_NAME_CASE(FMADD_RND)
@@ -42715,37 +42719,46 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
  if (NegMul) {
    switch (Opcode) {
    default: llvm_unreachable("Unexpected opcode");
    case ISD::FMA:              Opcode = X86ISD::FNMADD;        break;
    case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FNMADD; break;
    case X86ISD::FMADD_RND:     Opcode = X86ISD::FNMADD_RND;    break;
    case X86ISD::FMSUB:         Opcode = X86ISD::FNMSUB;        break;
    case X86ISD::STRICT_FMSUB:  Opcode = X86ISD::STRICT_FNMSUB; break;
    case X86ISD::FMSUB_RND:     Opcode = X86ISD::FNMSUB_RND;    break;
    case X86ISD::FNMADD:        Opcode = ISD::FMA;              break;
    case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA;       break;
    case X86ISD::FNMADD_RND:    Opcode = X86ISD::FMADD_RND;     break;
    case X86ISD::FNMSUB:        Opcode = X86ISD::FMSUB;         break;
    case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB;  break;
    case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FMSUB_RND;     break;
    }
  }

  if (NegAcc) {
    switch (Opcode) {
    default: llvm_unreachable("Unexpected opcode");
    case ISD::FMA:              Opcode = X86ISD::FMSUB;         break;
    case ISD::STRICT_FMA:       Opcode = X86ISD::STRICT_FMSUB;  break;
    case X86ISD::FMADD_RND:     Opcode = X86ISD::FMSUB_RND;     break;
    case X86ISD::FMSUB:         Opcode = ISD::FMA;              break;
    case X86ISD::STRICT_FMSUB:  Opcode = ISD::STRICT_FMA;       break;
    case X86ISD::FMSUB_RND:     Opcode = X86ISD::FMADD_RND;     break;
    case X86ISD::FNMADD:        Opcode = X86ISD::FNMSUB;        break;
    case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
    case X86ISD::FNMADD_RND:    Opcode = X86ISD::FNMSUB_RND;    break;
    case X86ISD::FNMSUB:        Opcode = X86ISD::FNMADD;        break;
    case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
    case X86ISD::FNMSUB_RND:    Opcode = X86ISD::FNMADD_RND;    break;
    case X86ISD::FMADDSUB:      Opcode = X86ISD::FMSUBADD;      break;
    case X86ISD::FMADDSUB_RND:  Opcode = X86ISD::FMSUBADD_RND;  break;
    case X86ISD::FMSUBADD:      Opcode = X86ISD::FMADDSUB;      break;
    case X86ISD::FMSUBADD_RND:  Opcode = X86ISD::FMADDSUB_RND;  break;
    }
  }

  if (NegRes) {
    switch (Opcode) {
    // For accuracy reason, we never combine fneg and fma under strict FP.
    default: llvm_unreachable("Unexpected opcode");
    case ISD::FMA:             Opcode = X86ISD::FNMSUB;       break;
    case X86ISD::FMADD_RND:    Opcode = X86ISD::FNMSUB_RND;   break;
@@ -43716,6 +43729,7 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDLoc dl(N);
  EVT VT = N->getValueType(0);
  bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();

  // Let legalize expand this if it isn't a legal type yet.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -43726,9 +43740,9 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
  if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
    return SDValue();

  SDValue A = N->getOperand(IsStrict ? 1 : 0);
  SDValue B = N->getOperand(IsStrict ? 2 : 1);
  SDValue C = N->getOperand(IsStrict ? 3 : 2);

  auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
    bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
@@ -43766,9 +43780,15 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
  unsigned NewOpcode =
      negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);

  if (IsStrict) {
    assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
    return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
                       {N->getOperand(0), A, B, C});
  } else {
    if (N->getNumOperands() == 4)
      return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
    return DAG.getNode(NewOpcode, dl, VT, A, B, C);
  }
}

// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
@@ -46269,12 +46289,16 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
  case X86ISD::FMADD_RND:
  case X86ISD::FMSUB:
  case X86ISD::STRICT_FMSUB:
  case X86ISD::FMSUB_RND:
  case X86ISD::FNMADD:
  case X86ISD::STRICT_FNMADD:
  case X86ISD::FNMADD_RND:
  case X86ISD::FNMSUB:
  case X86ISD::STRICT_FNMSUB:
  case X86ISD::FNMSUB_RND:
  case ISD::FMA:
  case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
  case X86ISD::FMADDSUB_RND:
  case X86ISD::FMSUBADD_RND:
  case X86ISD::FMADDSUB:


@@ -626,6 +626,9 @@ namespace llvm {
      // Vector signed/unsigned integer to float/double.
      STRICT_CVTSI2P, STRICT_CVTUI2P,

      // Strict FMA nodes.
      STRICT_FNMADD, STRICT_FMSUB, STRICT_FNMSUB,

      // Compare and swap.
      LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
      LCMPXCHG8_DAG,


@@ -6487,11 +6487,11 @@ multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
}

defm VFMADD213    : avx512_fma3p_213_f<0xA8, "vfmadd213", X86any_Fmadd, X86FmaddRnd>;
defm VFMSUB213    : avx512_fma3p_213_f<0xAA, "vfmsub213", X86any_Fmsub, X86FmsubRnd>;
defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>;
defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>;
defm VFNMADD213   : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86any_Fnmadd, X86FnmaddRnd>;
defm VFNMSUB213   : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86any_Fnmsub, X86FnmsubRnd>;

multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -6565,11 +6565,11 @@ multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
}

defm VFMADD231    : avx512_fma3p_231_f<0xB8, "vfmadd231", X86any_Fmadd, X86FmaddRnd>;
defm VFMSUB231    : avx512_fma3p_231_f<0xBA, "vfmsub231", X86any_Fmsub, X86FmsubRnd>;
defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>;
defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>;
defm VFNMADD231   : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86any_Fnmadd, X86FnmaddRnd>;
defm VFNMSUB231   : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86any_Fnmsub, X86FnmsubRnd>;

multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                               X86FoldableSchedWrite sched,
@@ -6645,11 +6645,11 @@ multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
}

defm VFMADD132    : avx512_fma3p_132_f<0x98, "vfmadd132", X86any_Fmadd, X86FmaddRnd>;
defm VFMSUB132    : avx512_fma3p_132_f<0x9A, "vfmsub132", X86any_Fmsub, X86FmsubRnd>;
defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>;
defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>;
defm VFNMADD132   : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86any_Fnmadd, X86FnmaddRnd>;
defm VFNMSUB132   : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86any_Fnmsub, X86FnmsubRnd>;

// Scalar FMA
multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
@@ -6742,9 +6742,9 @@ multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
}

defm VFMADD  : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86any_Fmadd, X86FmaddRnd>;
defm VFMSUB  : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86any_Fmsub, X86FmsubRnd>;
defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86any_Fnmadd, X86FnmaddRnd>;
defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86any_Fnmsub, X86FnmsubRnd>;

multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
                                      string Suffix, SDNode Move,
@@ -6950,20 +6950,20 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SS",
                                  X86Movss, v4f32x_info, fp32imm0>;
defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86FmsubRnd, "VFMSUB", "SS",
                                  X86Movss, v4f32x_info, fp32imm0>;
defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86FnmaddRnd, "VFNMADD", "SS",
                                  X86Movss, v4f32x_info, fp32imm0>;
defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86FnmsubRnd, "VFNMSUB", "SS",
                                  X86Movss, v4f32x_info, fp32imm0>;

defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SD",
                                  X86Movsd, v2f64x_info, fp64imm0>;
defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86FmsubRnd, "VFMSUB", "SD",
                                  X86Movsd, v2f64x_info, fp64imm0>;
defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86FnmaddRnd, "VFNMADD", "SD",
                                  X86Movsd, v2f64x_info, fp64imm0>;
defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86FnmsubRnd, "VFNMSUB", "SD",
                                  X86Movsd, v2f64x_info, fp64imm0>;

//===----------------------------------------------------------------------===//


@@ -126,7 +126,7 @@ let ExeDomain = SSEPackedSingle in {
                             loadv4f32, loadv8f32, X86any_Fmadd, v4f32, v8f32,
                             SchedWriteFMA>;
  defm VFMSUB    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
                               loadv4f32, loadv8f32, X86any_Fmsub, v4f32, v8f32,
                               SchedWriteFMA>;
  defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
                               loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32,
@@ -141,7 +141,7 @@ let ExeDomain = SSEPackedDouble in {
                             loadv2f64, loadv4f64, X86any_Fmadd, v2f64,
                             v4f64, SchedWriteFMA>, VEX_W;
  defm VFMSUB    : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
                               loadv2f64, loadv4f64, X86any_Fmsub, v2f64,
                               v4f64, SchedWriteFMA>, VEX_W;
  defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD",
                               loadv2f64, loadv4f64, X86Fmaddsub,
@@ -154,15 +154,15 @@ let ExeDomain = SSEPackedDouble in {
// Fused Negative Multiply-Add
let ExeDomain = SSEPackedSingle in {
  defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32,
                             loadv8f32, X86any_Fnmadd, v4f32, v8f32, SchedWriteFMA>;
  defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32,
                             loadv8f32, X86any_Fnmsub, v4f32, v8f32, SchedWriteFMA>;
}
let ExeDomain = SSEPackedDouble in {
  defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64,
                             loadv4f64, X86any_Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W;
  defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64,
                             loadv4f64, X86any_Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W;
}

// All source register operands of FMA opcodes defined in fma3s_rm multiclass
@@ -321,12 +321,12 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86any_Fmadd,
                    SchedWriteFMA.Scl>, VEX_LIG;
defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86any_Fmsub,
                    SchedWriteFMA.Scl>, VEX_LIG;
defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86any_Fnmadd,
                     SchedWriteFMA.Scl>, VEX_LIG;
defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86any_Fnmsub,
                     SchedWriteFMA.Scl>, VEX_LIG;

multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
@@ -373,14 +373,14 @@ multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
}

defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;

defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;

//===----------------------------------------------------------------------===//
// FMA4 - AMD 4 operand Fused Multiply-Add instructions
@@ -542,26 +542,26 @@ let ExeDomain = SSEPackedSingle in {
                          SchedWriteFMA.Scl>,
              fma4s_int<0x6A, "vfmaddss", ssmem, v4f32,
                        SchedWriteFMA.Scl>;
  defm VFMSUBSS4  : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86any_Fmsub, loadf32,
                          SchedWriteFMA.Scl>,
              fma4s_int<0x6E, "vfmsubss", ssmem, v4f32,
                        SchedWriteFMA.Scl>;
  defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
                          X86any_Fnmadd, loadf32, SchedWriteFMA.Scl>,
              fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32,
                        SchedWriteFMA.Scl>;
  defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
                          X86any_Fnmsub, loadf32, SchedWriteFMA.Scl>,
              fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32,
                        SchedWriteFMA.Scl>;
  // Packed Instructions
  defm VFMADDPS4    : fma4p<0x68, "vfmaddps", X86any_Fmadd, v4f32, v8f32,
                            loadv4f32, loadv8f32, SchedWriteFMA>;
  defm VFMSUBPS4    : fma4p<0x6C, "vfmsubps", X86any_Fmsub, v4f32, v8f32,
                            loadv4f32, loadv8f32, SchedWriteFMA>;
  defm VFNMADDPS4   : fma4p<0x78, "vfnmaddps", X86any_Fnmadd, v4f32, v8f32,
                            loadv4f32, loadv8f32, SchedWriteFMA>;
  defm VFNMSUBPS4   : fma4p<0x7C, "vfnmsubps", X86any_Fnmsub, v4f32, v8f32,
                            loadv4f32, loadv8f32, SchedWriteFMA>;
  defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
                            loadv4f32, loadv8f32, SchedWriteFMA>;
@@ -575,26 +575,26 @@ let ExeDomain = SSEPackedDouble in {
                          SchedWriteFMA.Scl>,
              fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64,
                        SchedWriteFMA.Scl>;
  defm VFMSUBSD4  : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86any_Fmsub, loadf64,
                          SchedWriteFMA.Scl>,
              fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64,
                        SchedWriteFMA.Scl>;
  defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
                          X86any_Fnmadd, loadf64, SchedWriteFMA.Scl>,
              fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64,
                        SchedWriteFMA.Scl>;
  defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
                          X86any_Fnmsub, loadf64, SchedWriteFMA.Scl>,
              fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64,
                        SchedWriteFMA.Scl>;
  // Packed Instructions
  defm VFMADDPD4    : fma4p<0x69, "vfmaddpd", X86any_Fmadd, v2f64, v4f64,
                            loadv2f64, loadv4f64, SchedWriteFMA>;
  defm VFMSUBPD4    : fma4p<0x6D, "vfmsubpd", X86any_Fmsub, v2f64, v4f64,
                            loadv2f64, loadv4f64, SchedWriteFMA>;
  defm VFNMADDPD4   : fma4p<0x79, "vfnmaddpd", X86any_Fnmadd, v2f64, v4f64,
                            loadv2f64, loadv4f64, SchedWriteFMA>;
  defm VFNMSUBPD4   : fma4p<0x7D, "vfnmsubpd", X86any_Fnmsub, v2f64, v4f64,
                            loadv2f64, loadv4f64, SchedWriteFMA>;
  defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
                            loadv2f64, loadv4f64, SchedWriteFMA>;
@@ -630,11 +630,11 @@ multiclass scalar_fma4_patterns<SDNode Op, string Name,
}

defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;

defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;


@@ -535,8 +535,20 @@ def X86any_Fmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3),
                            [(X86strict_Fmadd node:$src1, node:$src2, node:$src3),
                             (X86Fmadd node:$src1, node:$src2, node:$src3)]>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>;
def X86strict_Fnmadd : SDNode<"X86ISD::STRICT_FNMADD", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
def X86any_Fnmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3),
                             [(X86strict_Fnmadd node:$src1, node:$src2, node:$src3),
                              (X86Fnmadd node:$src1, node:$src2, node:$src3)]>;
def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
def X86strict_Fmsub : SDNode<"X86ISD::STRICT_FMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
def X86any_Fmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3),
                            [(X86strict_Fmsub node:$src1, node:$src2, node:$src3),
                             (X86Fmsub node:$src1, node:$src2, node:$src3)]>;
def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
def X86strict_Fnmsub : SDNode<"X86ISD::STRICT_FNMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
def X86any_Fnmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3),
                             [(X86strict_Fnmsub node:$src1, node:$src2, node:$src3),
                              (X86Fnmsub node:$src1, node:$src2, node:$src3)]>;
def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp, [SDNPCommutative]>;
def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp, [SDNPCommutative]>;


@@ -1,7 +1,326 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s --check-prefixes=COMMON,NOFMA
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma < %s | FileCheck %s --check-prefixes=COMMON,FMA,FMA-AVX1
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+fma4 < %s | FileCheck %s --check-prefixes=COMMON,FMA4
; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefixes=COMMON,FMA,FMA-AVX512
define float @f1(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f1:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: pushq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 16
; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: popq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f1:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT: retq
;
; FMA4-LABEL: f1:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg float %0
%result = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %2,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret float %result
}
define double @f2(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f2:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: pushq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 16
; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: popq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f2:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT: retq
;
; FMA4-LABEL: f2:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg double %0
%result = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %2,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret double %result
}
define float @f3(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f3:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: pushq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 16
; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm2
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: popq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f3:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT: retq
;
; FMA4-LABEL: f3:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmsubss %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg float %2
%result = call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %3,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret float %result
}
define double @f4(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f4:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: pushq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 16
; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm2
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: popq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f4:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT: retq
;
; FMA4-LABEL: f4:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg double %2
%result = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %3,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret double %result
}
define float @f5(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f5:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: pushq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 16
; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT: xorps %xmm3, %xmm0
; NOFMA-NEXT: xorps %xmm3, %xmm2
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: popq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f5:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT: retq
;
; FMA4-LABEL: f5:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg float %0
%4 = fneg float %2
%result = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %4,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret float %result
}
define double @f6(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f6:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: pushq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 16
; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT: xorps %xmm3, %xmm0
; NOFMA-NEXT: xorps %xmm3, %xmm2
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: popq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f6:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT: retq
;
; FMA4-LABEL: f6:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg double %0
%4 = fneg double %2
%result = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %4,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret double %result
}
define float @f7(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f7:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: pushq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 16
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT: popq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-AVX1-LABEL: f7:
; FMA-AVX1: # %bb.0: # %entry
; FMA-AVX1-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT: retq
;
; FMA4-LABEL: f7:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddss %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT: retq
;
; FMA-AVX512-LABEL: f7:
; FMA-AVX512: # %bb.0: # %entry
; FMA-AVX512-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT: retq
entry:
%3 = call float @llvm.experimental.constrained.fma.f32(float %0, float %1, float %2,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
%result = fneg float %3
ret float %result
}
define double @f8(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f8:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: pushq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 16
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT: popq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f8:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: f8:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = call double @llvm.experimental.constrained.fma.f64(double %0, double %1, double %2,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
%result = fneg double %3
ret double %result
}
define float @f9(float %0, float %1, float %2) #0 {
; NOFMA-LABEL: f9:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: pushq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 16
; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT: xorps %xmm3, %xmm0
; NOFMA-NEXT: xorps %xmm3, %xmm2
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT: popq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-AVX1-LABEL: f9:
; FMA-AVX1: # %bb.0: # %entry
; FMA-AVX1-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT: retq
;
; FMA4-LABEL: f9:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT: retq
;
; FMA-AVX512-LABEL: f9:
; FMA-AVX512: # %bb.0: # %entry
; FMA-AVX512-NEXT: vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT: retq
entry:
%3 = fneg float %0
%4 = fneg float %2
%5 = call float @llvm.experimental.constrained.fma.f32(float %3, float %1, float %4,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
%result = fneg float %5
ret float %result
}
define double @f10(double %0, double %1, double %2) #0 {
; NOFMA-LABEL: f10:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: pushq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 16
; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT: xorps %xmm3, %xmm0
; NOFMA-NEXT: xorps %xmm3, %xmm2
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT: popq %rax
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f10:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: f10:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmsubsd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg double %0
%4 = fneg double %2
%5 = call double @llvm.experimental.constrained.fma.f64(double %3, double %1, double %4,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
%result = fneg double %5
ret double %result
}
; Verify that fma(3.5) isn't simplified when the rounding mode is
; unknown.
@@ -23,6 +342,12 @@ define float @f17() #0 {
; FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: f17:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; FMA4-NEXT: vfmaddss %xmm0, %xmm0, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%result = call float @llvm.experimental.constrained.fma.f32(
float 3.5,
@@ -53,6 +378,12 @@ define double @f18() #0 {
; FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: f18:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; FMA4-NEXT: vfmaddsd %xmm0, %xmm0, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%result = call double @llvm.experimental.constrained.fma.f64(
double 42.1,
@@ -63,7 +394,567 @@
ret double %result
}
define <4 x float> @f19(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f19:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: subq $88, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 96
; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm0
; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; NOFMA-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
; NOFMA-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = mem[1,1,2,3]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT: movdqa %xmm1, %xmm0
; NOFMA-NEXT: addq $88, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f19:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT: retq
;
; FMA4-LABEL: f19:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg <4 x float> %0
%result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %2,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x float> %result
}
define <2 x double> @f20(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f20:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: subq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 80
; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm0
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT: movdqa %xmm1, %xmm0
; NOFMA-NEXT: addq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f20:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
; FMA-NEXT: retq
;
; FMA4-LABEL: f20:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmaddpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg <2 x double> %0
%result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %2,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %result
}
define <4 x float> @f21(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f21:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: subq $88, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 96
; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm2
; NOFMA-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; NOFMA-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm2 = mem[1,1,2,3]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT: movaps %xmm1, %xmm0
; NOFMA-NEXT: addq $88, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f21:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT: retq
;
; FMA4-LABEL: f21:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg <4 x float> %2
%result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %3,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x float> %result
}
define <2 x double> @f22(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f22:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: subq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 80
; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm2
; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT: movaps %xmm1, %xmm0
; NOFMA-NEXT: addq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f22:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
; FMA-NEXT: retq
;
; FMA4-LABEL: f22:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg <2 x double> %2
%result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %3,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %result
}
define <4 x float> @f23(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f23:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: subq $88, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 96
; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT: pxor %xmm3, %xmm0
; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pxor %xmm3, %xmm2
; NOFMA-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = mem[1,1,2,3]
; NOFMA-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm2 = mem[1,1,2,3]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT: movdqa %xmm1, %xmm0
; NOFMA-NEXT: addq $88, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f23:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT: retq
;
; FMA4-LABEL: f23:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg <4 x float> %0
%4 = fneg <4 x float> %2
%result = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %4,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <4 x float> %result
}
define <2 x double> @f24(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f24:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: subq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 80
; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT: xorps %xmm3, %xmm0
; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: xorps %xmm3, %xmm2
; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT: movdqa %xmm1, %xmm0
; NOFMA-NEXT: addq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f24:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT: retq
;
; FMA4-LABEL: f24:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg <2 x double> %0
%4 = fneg <2 x double> %2
%result = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %4,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
ret <2 x double> %result
}
define <4 x float> @f25(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f25:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: subq $88, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 96
; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; NOFMA-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,1,2,3]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,2,3]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1,2,3]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT: unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm1
; NOFMA-NEXT: movaps %xmm1, %xmm0
; NOFMA-NEXT: addq $88, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-AVX1-LABEL: f25:
; FMA-AVX1: # %bb.0: # %entry
; FMA-AVX1-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT: retq
;
; FMA4-LABEL: f25:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT: retq
;
; FMA-AVX512-LABEL: f25:
; FMA-AVX512: # %bb.0: # %entry
; FMA-AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT: retq
entry:
%3 = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %0, <4 x float> %1, <4 x float> %2,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
%result = fneg <4 x float> %3
ret <4 x float> %result
}
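; f26 is the <2 x double> counterpart of f25: the fneg of the strict fma result
; stays as a separate vxorpd after the vfmaddpd.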
define <2 x double> @f26(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f26:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: subq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 80
; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT: xorps {{.*}}(%rip), %xmm1
; NOFMA-NEXT: movaps %xmm1, %xmm0
; NOFMA-NEXT: addq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f26:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: f26:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfmaddpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %0, <2 x double> %1, <2 x double> %2,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
%result = fneg <2 x double> %3
ret <2 x double> %result
}
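; f27 negates both the first multiplicand and the addend, which folds into
; vfnmsub, and then negates the result, which remains a separate xor under
; strict FP.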
define <4 x float> @f27(<4 x float> %0, <4 x float> %1, <4 x float> %2) #0 {
; NOFMA-LABEL: f27:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: subq $88, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 96
; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movdqa {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; NOFMA-NEXT: pxor %xmm3, %xmm0
; NOFMA-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pxor %xmm3, %xmm2
; NOFMA-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; NOFMA-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; NOFMA-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = mem[1,1,2,3]
; NOFMA-NEXT: pshufd $229, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm2 = mem[1,1,2,3]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,2,3]
; NOFMA-NEXT: callq fmaf
; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; NOFMA-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm1 = xmm1[0],mem[0]
; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm1
; NOFMA-NEXT: movdqa %xmm1, %xmm0
; NOFMA-NEXT: addq $88, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-AVX1-LABEL: f27:
; FMA-AVX1: # %bb.0: # %entry
; FMA-AVX1-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX1-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA-AVX1-NEXT: retq
;
; FMA4-LABEL: f27:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmsubps %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: vxorps {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT: retq
;
; FMA-AVX512-LABEL: f27:
; FMA-AVX512: # %bb.0: # %entry
; FMA-AVX512-NEXT: vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; FMA-AVX512-NEXT: vxorps %xmm1, %xmm0, %xmm0
; FMA-AVX512-NEXT: retq
entry:
%3 = fneg <4 x float> %0
%4 = fneg <4 x float> %2
%5 = call <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float> %3, <4 x float> %1, <4 x float> %4,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
%result = fneg <4 x float> %5
ret <4 x float> %result
}
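; f28 is the <2 x double> counterpart of f27: the operand negations fold into
; vfnmsubpd while the result negation stays as a separate vxorpd.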
define <2 x double> @f28(<2 x double> %0, <2 x double> %1, <2 x double> %2) #0 {
; NOFMA-LABEL: f28:
; NOFMA: # %bb.0: # %entry
; NOFMA-NEXT: subq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 80
; NOFMA-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: movaps {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0]
; NOFMA-NEXT: xorps %xmm3, %xmm0
; NOFMA-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
; NOFMA-NEXT: xorps %xmm3, %xmm2
; NOFMA-NEXT: movaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; NOFMA-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm0 = mem[2,3,2,3]
; NOFMA-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
; NOFMA-NEXT: # xmm2 = mem[2,3,2,3]
; NOFMA-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: movhlps {{.*#+}} xmm1 = xmm1[1,1]
; NOFMA-NEXT: callq fma
; NOFMA-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; NOFMA-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
; NOFMA-NEXT: pxor {{.*}}(%rip), %xmm1
; NOFMA-NEXT: movdqa %xmm1, %xmm0
; NOFMA-NEXT: addq $72, %rsp
; NOFMA-NEXT: .cfi_def_cfa_offset 8
; NOFMA-NEXT: retq
;
; FMA-LABEL: f28:
; FMA: # %bb.0: # %entry
; FMA-NEXT: vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
; FMA-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA-NEXT: retq
;
; FMA4-LABEL: f28:
; FMA4: # %bb.0: # %entry
; FMA4-NEXT: vfnmsubpd %xmm2, %xmm1, %xmm0, %xmm0
; FMA4-NEXT: vxorpd {{.*}}(%rip), %xmm0, %xmm0
; FMA4-NEXT: retq
entry:
%3 = fneg <2 x double> %0
%4 = fneg <2 x double> %2
%5 = call <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double> %3, <2 x double> %1, <2 x double> %4,
metadata !"round.dynamic",
metadata !"fpexcept.strict") #0
%result = fneg <2 x double> %5
ret <2 x double> %result
}
attributes #0 = { strictfp }
declare float @llvm.experimental.constrained.fma.f32(float, float, float, metadata, metadata)
declare double @llvm.experimental.constrained.fma.f64(double, double, double, metadata, metadata)
declare <4 x float> @llvm.experimental.constrained.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, metadata, metadata)
declare <2 x double> @llvm.experimental.constrained.fma.v2f64(<2 x double>, <2 x double>, <2 x double>, metadata, metadata)