1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 03:02:36 +01:00

[ARM] Armv8.2-A FP16 code generation (part 3/3)

This adds most of the FP16 codegen support, but these areas need further work:

- FP16 literals and immediates are not properly supported yet (e.g. literal
  pool needs work),
- Instructions that are generated from intrinsics (e.g. vabs) haven't been
  added.

These will be addressed in follow-up patches.

Differential Revision: https://reviews.llvm.org/D42849

llvm-svn: 324321
This commit is contained in:
Sjoerd Meijer 2018-02-06 08:43:56 +00:00
parent 5442996944
commit 7941278b18
4 changed files with 673 additions and 42 deletions

View File

@ -1042,6 +1042,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
setOperationAction(ISD::SETCC, MVT::i32, Expand);
setOperationAction(ISD::SETCC, MVT::f16, Expand);
setOperationAction(ISD::SETCC, MVT::f32, Expand);
setOperationAction(ISD::SETCC, MVT::f64, Expand);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
@ -12746,6 +12747,24 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
// Reports whether an fneg of type VT should be treated as free on this target.
//
// There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
// negate values directly, so for those fneg costs nothing. Answering true
// keeps the DAG combiner from rewriting fneg into xors and some other
// instructions; for f16 and FullFP16 argument passing some bitcast nodes may
// be introduced, and those would otherwise trigger that rewrite.
bool ARMTargetLowering::isFNegFree(EVT VT) const {
  // Extended (non-simple) value types are never reported as free.
  if (!VT.isSimple())
    return false;

  // Only f16 qualifies, and only when the subtarget provides FullFP16.
  const bool IsHalf = VT.getSimpleVT().SimpleTy == MVT::f16;
  return IsHalf && Subtarget->hasFullFP16();
}
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
EVT VT = ExtVal.getValueType();
@ -13842,6 +13861,8 @@ bool ARM::isBitFieldInvertedMask(unsigned v) {
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
if (!Subtarget->hasVFP3())
return false;
if (VT == MVT::f16 && Subtarget->hasFullFP16())
return ARM_AM::getFP16Imm(Imm) != -1;
if (VT == MVT::f32)
return ARM_AM::getFP32Imm(Imm) != -1;
if (VT == MVT::f64 && !Subtarget->isFPOnlySP())

View File

@ -331,6 +331,7 @@ class VectorType;
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
bool isFNegFree(EVT VT) const override;
bool isVectorLoadExtDesirable(SDValue ExtVal) const override;

View File

@ -395,9 +395,9 @@ def VDIVS : ASbI<0b11101, 0b00, 0, 0,
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VDIVH : AHbI<0b11101, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fdiv HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPDIV32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@ -420,9 +420,9 @@ def VMULS : ASbIn<0b11100, 0b10, 0, 0,
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VMULH : AHbI<0b11100, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fmul HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
def VNMULD : ADbI<0b11100, 0b10, 1, 0,
@ -442,9 +442,9 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
}
def VNMULH : AHbI<0b11100, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fneg (fmul HPR:$Sn, HPR:$Sm)))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
multiclass vsel_inst<string op, bits<2> opc, int CC> {
@ -525,9 +525,9 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
}
def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins SPR:$Sd, SPR:$Sm),
(outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
[]>;
[(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 1))]>;
def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
@ -544,9 +544,9 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
}
def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins SPR:$Sd, SPR:$Sm),
(outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
[]>;
[(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 0))]>;
} // Defs = [FPSCR_NZCV]
//===----------------------------------------------------------------------===//
@ -771,7 +771,7 @@ multiclass vcvt_inst<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs SPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"),
[]>,
Requires<[HasFullFP16]> {
@ -779,7 +779,7 @@ multiclass vcvt_inst<string opc, bits<2> rm,
}
def UH : AHuInp<0b11101, 0b11, 0b1100, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs SPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".u32.f16\t$Sd, $Sm"),
[]>,
Requires<[HasFullFP16]> {
@ -834,6 +834,17 @@ multiclass vcvt_inst<string opc, bits<2> rm,
}
let Predicates = [HasFPARMv8] in {
let Predicates = [HasFullFP16] in {
def : Pat<(i32 (fp_to_sint (node HPR:$a))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"SH") HPR:$a),
GPR)>;
def : Pat<(i32 (fp_to_uint (node HPR:$a))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"UH") HPR:$a),
GPR)>;
}
def : Pat<(i32 (fp_to_sint (node SPR:$a))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"SS") SPR:$a),
@ -875,9 +886,9 @@ def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
}
def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm",
[]>;
[(set HPR:$Sd, (fneg HPR:$Sm))]>;
multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0,
@ -1313,13 +1324,16 @@ def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VSITOS (VLDRS addrmode5:$a))>;
def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs HPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // s32
}
def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)),
(VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>;
def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
@ -1355,13 +1369,16 @@ def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VUITOS (VLDRS addrmode5:$a))>;
def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs HPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 0; // u32
}
def : VFPNoNEONPat<(f16 (uint_to_fp GPR:$a)),
(VUITOH (COPY_TO_REGCLASS GPR:$a, SPR))>;
// FP -> Int:
class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@ -1456,13 +1473,16 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
(VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;
def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs SPR:$Sd), (ins HPR:$Sm),
IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
def : VFPNoNEONPat<(i32 (fp_to_sint HPR:$a)),
(COPY_TO_REGCLASS (VTOSIZH HPR:$a), GPR)>;
def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
@ -1499,13 +1519,16 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
(VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;
def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs SPR:$Sd), (ins HPR:$Sm),
IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}
def : VFPNoNEONPat<(i32 (fp_to_uint HPR:$a)),
(COPY_TO_REGCLASS (VTOUIZH HPR:$a), GPR)>;
// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
let Uses = [FPSCR] in {
def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
@ -1789,9 +1812,10 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
}
def VMLAH : AHbI<0b11100, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
@ -1801,6 +1825,10 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
(VMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
def VMLSD : ADbI<0b11100, 0b00, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@ -1825,9 +1853,10 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
}
def VMLSH : AHbI<0b11100, 0b00, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
@ -1837,6 +1866,9 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
(VMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@ -1861,9 +1893,10 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
}
def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
@ -1874,6 +1907,9 @@ def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin),
(VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
// (-dst - (a * b)) -> -(dst + (a * b))
def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
@ -1882,6 +1918,9 @@ def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)),
(VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@ -1905,9 +1944,9 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
}
def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;
@ -1917,6 +1956,9 @@ def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
(VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin),
(VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
//===----------------------------------------------------------------------===//
// Fused FP Multiply-Accumulate Operations.
@ -1943,9 +1985,10 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
}
def VFMAH : AHbI<0b11101, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@ -1956,6 +1999,9 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
(VFMAH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
// (fma x, y, z) -> (vfms z, x, y)
@ -1988,9 +2034,10 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
}
def VFMSH : AHbI<0b11101, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@ -2001,6 +2048,9 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
(VFMSH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
// (fma (fneg x), y, z) -> (vfms z, x, y)
@ -2040,9 +2090,10 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
}
def VFNMAH : AHbI<0b11101, 0b01, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@ -2091,9 +2142,9 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
}
def VFNMSH : AHbI<0b11101, 0b01, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;

View File

@ -1,29 +1,44 @@
; SOFT:
; RUN: llc < %s -mtriple=arm-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
; RUN: llc < %s -mtriple=thumb-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
; SOFTFP:
; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp3 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-VFP3
; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-FP16
; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-FULLFP16
; RUN: llc < %s -mtriple=thumbv7-none-eabi -mattr=+vfp3 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-VFP3
; RUN: llc < %s -mtriple=thumbv7-none-eabi -mattr=+vfp4 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-FP16
; RUN: llc < %s -mtriple=thumbv7-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-FULLFP16
; HARD:
; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp3 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-VFP3
; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp4 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FP16
; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FULLFP16
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mattr=+vfp3 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-VFP3
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mattr=+vfp4 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FP16
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FULLFP16
define float @RetValBug(float %A.coerce) local_unnamed_addr {
; FP-CONTRACT=FAST
; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+fullfp16 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FULLFP16-FAST
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mattr=+fullfp16 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FULLFP16-FAST
define float @RetValBug(float %A.coerce) {
entry:
ret float undef
; This expression is optimised away due to the undef value. Check that
; LowerReturn can handle undef nodes (i.e. nodes which do not have any
; operands) when FullFP16 is enabled.
; Check that LowerReturn can handle undef nodes (i.e. nodes which do not have
; any operands) when FullFP16 is enabled.
;
; CHECK-LABEL: RetValBug:
; CHECK-HARDFP-FULLFP16: mov pc, lr
; CHECK-HARDFP-FULLFP16: {{.*}} lr
}
define float @Add(float %a.coerce, float %b.coerce) local_unnamed_addr {
; 1. VABS: TODO
; 2. VADD
define float @Add(float %a.coerce, float %b.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
@ -61,7 +76,6 @@ entry:
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0
; CHECK-SOFTFP-FULLFP16: vadd.f16 [[S0]], [[S2]], [[S0]]
; CHECK-SOFTFP-FULLFP16-NEXT: vmov.f16 r0, s0
; CHECK-SOFTFP-FULLFP16-NEXT: mov pc, lr
; CHECK-HARDFP-VFP3: vmov r{{.}}, s0
; CHECK-HARDFP-VFP3: vmov{{.*}}, s1
@ -77,5 +91,549 @@ entry:
; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
; CHECK-HARDFP-FULLFP16: vadd.f16 s0, s0, s1
; CHECK-HARDFP-FULLFP16-NEXT: mov pc, lr
}
; 3. VCMP
; Compares two half values with "fcmp ogt". Each half is recovered from a
; float-typed argument by bitcasting to i32, truncating to i16 and bitcasting
; to half.
; NOTE(review): despite the function name, every hardware-FP check below
; expects a vcmpe (not vcmp) instruction.
define zeroext i1 @VCMP(float %F.coerce, float %G.coerce) {
entry:
%0 = bitcast float %F.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %G.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%cmp = fcmp ogt half %1, %3
ret i1 %cmp
; CHECK-LABEL: VCMP:
; CHECK-SOFT: bl __aeabi_fcmpgt
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: vcmpe.f32 s{{.}}, s{{.}}
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 s{{.}}, s{{.}}
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 s{{.}}, s{{.}}
; CHECK-SOFTFP-FP16: vcmpe.f32 s{{.}}, s{{.}}
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S0:s[0-9]]], r1
; CHECK-SOFTFP-FULLFP16: vcmpe.f16 [[S2]], [[S0]]
; CHECK-SOFTFP-FULLFP16-NOT: vmov.f16 s{{.}}, r0
; CHECK-SOFTFP-FULLFP16-NOT: vmov.f16 s{{.}}, r1
; CHECK-HARDFP-FULLFP16: vcmpe.f16 s0, s1
}
; 4. VCMPE
; FIXME: enable when constant pool is fixed
;
;define i32 @VCMPE_IMM(float %F.coerce) {
;entry:
; %0 = bitcast float %F.coerce to i32
; %tmp.0.extract.trunc = trunc i32 %0 to i16
; %1 = bitcast i16 %tmp.0.extract.trunc to half
; %tmp = fcmp olt half %1, 1.000000e+00
; %tmp1 = zext i1 %tmp to i32
; ret i32 %tmp1
;}
; Compares two half values with "fcmp olt" and zero-extends the i1 result to
; i32. Each half is recovered from a float-typed argument via bitcast/trunc.
define i32 @VCMPE(float %F.coerce, float %G.coerce) {
entry:
%0 = bitcast float %F.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %G.coerce to i32
%tmp.1.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp.1.extract.trunc to half
%tmp = fcmp olt half %1, %3
%tmp1 = zext i1 %tmp to i32
ret i32 %tmp1
; NOTE(review): only the function label is matched here; no instruction checks
; follow it, so this test does not yet pin the compare that is emitted.
; CHECK-LABEL: VCMPE:
}
; 5. VCVT (between floating-point and fixed-point)
; Only assembly/disassembly support
; 6. VCVT (between floating-point and integer, both directions)
; Converts a half to a signed i32. The half is taken from the low 16 bits of
; the i32 argument (trunc to i16, then bitcast to half).
define i32 @fptosi(i32 %A.coerce) {
entry:
%tmp.0.extract.trunc = trunc i32 %A.coerce to i16
%0 = bitcast i16 %tmp.0.extract.trunc to half
%conv = fptosi half %0 to i32
ret i32 %conv
; Only the hard-float FullFP16 configuration has instruction checks.
; CHECK-LABEL: fptosi:
; CHECK-HARDFP-FULLFP16: vmov.f16 s0, r0
; CHECK-HARDFP-FULLFP16-NEXT: vcvt.s32.f16 s0, s0
; CHECK-HARDFP-FULLFP16-NEXT: vmov r0, s0
}
; Converts a half to an unsigned i32. The half is taken from the low 16 bits
; of the i32 argument (trunc to i16, then bitcast to half).
define i32 @fptoui(i32 %A.coerce) {
entry:
%tmp.0.extract.trunc = trunc i32 %A.coerce to i16
%0 = bitcast i16 %tmp.0.extract.trunc to half
%conv = fptoui half %0 to i32
ret i32 %conv
; The label anchors the checks below to this function, as the other tests in
; this file do; without it they could match output from a later function.
; CHECK-LABEL: fptoui:
; CHECK-HARDFP-FULLFP16: vcvt.u32.f16 s0, s0
; CHECK-HARDFP-FULLFP16-NEXT: vmov r0, s0
}
; Converts the unsigned i32 %a to half. The half result is returned inside a
; float-typed value: bitcast to i16, zero-extend to i32, bitcast to float.
; %b is not used in the body.
define float @UintToH(i32 %a, i32 %b) {
entry:
%0 = uitofp i32 %a to half
%1 = bitcast half %0 to i16
%tmp0.insert.ext = zext i16 %1 to i32
%2 = bitcast i32 %tmp0.insert.ext to float
ret float %2
; CHECK-LABEL: UintToH:
; CHECK-HARDFP-FULLFP16: vmov s0, r0
; CHECK-HARDFP-FULLFP16-NEXT: vcvt.f16.u32 s0, s0
}
; Converts the signed i32 %a to half. The half result is returned inside a
; float-typed value: bitcast to i16, zero-extend to i32, bitcast to float.
; %b is not used in the body.
define float @SintToH(i32 %a, i32 %b) {
entry:
%0 = sitofp i32 %a to half
%1 = bitcast half %0 to i16
%tmp0.insert.ext = zext i16 %1 to i32
%2 = bitcast i32 %tmp0.insert.ext to float
ret float %2
; CHECK-LABEL: SintToH:
; CHECK-HARDFP-FULLFP16: vmov s0, r0
; CHECK-HARDFP-FULLFP16-NEXT: vcvt.f16.s32 s0, s0
}
; TODO:
; 7. VCVTA
; 8. VCVTM
; 9. VCVTN
; 10. VCVTP
; 11. VCVTR
; 12. VDIV
define float @Div(float %a.coerce, float %b.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%add = fdiv half %1, %3
%4 = bitcast half %add to i16
%tmp4.0.insert.ext = zext i16 %4 to i32
%5 = bitcast i32 %tmp4.0.insert.ext to float
ret float %5
; CHECK-LABEL: Div:
; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_fdiv
; CHECK-SOFT: bl __aeabi_f2h
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: vdiv.f32
; CHECK-SOFTFP-VFP3: bl __aeabi_f2h
; CHECK-SOFTFP-FP16: vmov [[S2:s[0-9]]], r1
; CHECK-SOFTFP-FP16: vmov [[S0:s[0-9]]], r0
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vdiv.f32 [[S0]], [[S0]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vmov r0, s0
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S0:s[0-9]]], r1
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0
; CHECK-SOFTFP-FULLFP16: vdiv.f16 [[S0]], [[S2]], [[S0]]
; CHECK-SOFTFP-FULLFP16-NEXT: vmov.f16 r0, s0
; CHECK-HARDFP-VFP3: vmov r{{.}}, s0
; CHECK-HARDFP-VFP3: vmov{{.*}}, s1
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: vdiv.f32
; CHECK-HARDFP-VFP3: bl __aeabi_f2h
; CHECK-HARDFP-VFP3: vmov s0, r0
; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], s1
; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S0:s[0-9]]], s0
; CHECK-HARDFP-FP16: vdiv.f32 [[S0]], [[S0]], [[S2]]
; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
; CHECK-HARDFP-FULLFP16: vdiv.f16 s0, s0, s1
}
; 13. VFMA
define float @VFMA(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%mul = fmul half %1, %3
%add = fadd half %mul, %5
%6 = bitcast half %add to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7
; CHECK-LABEL: VFMA:
; CHECK-HARDFP-FULLFP16-FAST: vfma.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16-FAST-NEXT: vmov.f32 s0, s2
}
; 14. VFMS
define float @VFMS(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%mul = fmul half %1, %3
%sub = fsub half %5, %mul
%6 = bitcast half %sub to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7
; CHECK-LABEL: VFMS:
; CHECK-HARDFP-FULLFP16-FAST: vfms.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16-FAST-NEXT: vmov.f32 s0, s2
}
; 15. VFNMA
define float @VFNMA(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%mul = fmul half %1, %3
%sub = fsub half -0.0, %mul
%sub2 = fsub half %sub, %5
%6 = bitcast half %sub2 to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7
; CHECK-LABEL: VFNMA:
; CHECK-HARDFP-FULLFP16-FAST: vfnma.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16-FAST-NEXT: vmov.f32 s0, s2
}
; 16. VFNMS
define float @VFNMS(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%mul = fmul half %1, %3
%sub2 = fsub half %mul, %5
%6 = bitcast half %sub2 to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7
; CHECK-LABEL: VFNMS:
; CHECK-HARDFP-FULLFP16-FAST: vfnms.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16-FAST-NEXT: vmov.f32 s0, s2
}
; TODO:
; 17. VMAXNM
; 18. VMINNM
; 19. VMLA
define float @VMLA(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%mul = fmul half %1, %3
%add = fadd half %5, %mul
%6 = bitcast half %add to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7
; CHECK-LABEL: VMLA:
; CHECK-HARDFP-FULLFP16: vmla.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16-NEXT: vmov.f32 s0, s2
}
; 20. VMLS
define float @VMLS(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%mul = fmul half %1, %3
%add = fsub half %5, %mul
%6 = bitcast half %add to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7
; CHECK-LABEL: VMLS:
; CHECK-HARDFP-FULLFP16: vmls.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16-NEXT: vmov.f32 s0, s2
}
; TODO: fix immediates.
; 21. VMOV (between general-purpose register and half-precision register)
; 22. VMOV (immediate)
; 23. VMUL
define float @Mul(float %a.coerce, float %b.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%add = fmul half %1, %3
%4 = bitcast half %add to i16
%tmp4.0.insert.ext = zext i16 %4 to i32
%5 = bitcast i32 %tmp4.0.insert.ext to float
ret float %5
; CHECK-LABEL: Mul:
; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_fmul
; CHECK-SOFT: bl __aeabi_f2h
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: vmul.f32
; CHECK-SOFTFP-VFP3: bl __aeabi_f2h
; CHECK-SOFTFP-FP16: vmov [[S2:s[0-9]]], r1
; CHECK-SOFTFP-FP16: vmov [[S0:s[0-9]]], r0
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vmul.f32 [[S0]], [[S0]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vmov r0, s0
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S0:s[0-9]]], r1
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0
; CHECK-SOFTFP-FULLFP16: vmul.f16 [[S0]], [[S2]], [[S0]]
; CHECK-SOFTFP-FULLFP16-NEXT: vmov.f16 r0, s0
; CHECK-HARDFP-VFP3: vmov r{{.}}, s0
; CHECK-HARDFP-VFP3: vmov{{.*}}, s1
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: vmul.f32
; CHECK-HARDFP-VFP3: bl __aeabi_f2h
; CHECK-HARDFP-VFP3: vmov s0, r0
; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], s1
; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S0:s[0-9]]], s0
; CHECK-HARDFP-FP16: vmul.f32 [[S0]], [[S0]], [[S2]]
; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
; CHECK-HARDFP-FULLFP16: vmul.f16 s0, s0, s1
}
; 24. VNEG
; Negates a half value, written as a subtraction from -0.0. The half is
; extracted from, and the result re-packed into, a float-typed value.
define float @Neg(float %a.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = fsub half -0.000000e+00, %1
%3 = bitcast half %2 to i16
%tmp4.0.insert.ext = zext i16 %3 to i32
%4 = bitcast i32 %tmp4.0.insert.ext to float
ret float %4
; CHECK-LABEL: Neg:
; CHECK-HARDFP-FULLFP16: vneg.f16 s0, s0
}
; 25. VNMLA
define float @VNMLA(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%add = fmul half %1, %3
%add2 = fsub half -0.000000e+00, %add
%add3 = fsub half %add2, %5
%6 = bitcast half %add3 to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7
; CHECK-LABEL: VNMLA:
; CHECK-HARDFP-FULLFP16: vnmla.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16: vmov.f32 s0, s2
}
; 26. VNMLS
define float @VNMLS(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%add = fmul half %1, %3
%add2 = fsub half %add, %5
%6 = bitcast half %add2 to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7
; CHECK-LABEL: VNMLS:
; CHECK-HARDFP-FULLFP16: vnmls.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16: vmov.f32 s0, s2
}
; 27. VNMUL
; Multiplies two half values and negates the product (subtraction from -0.0);
; the hard-float FullFP16 check expects a single vnmul.f16 for the pair.
define float @NMul(float %a.coerce, float %b.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%add = fmul half %1, %3
%add2 = fsub half -0.0, %add
%4 = bitcast half %add2 to i16
%tmp4.0.insert.ext = zext i16 %4 to i32
%5 = bitcast i32 %tmp4.0.insert.ext to float
ret float %5
; CHECK-LABEL: NMul:
; CHECK-HARDFP-FULLFP16: vnmul.f16 s0, s0, s1
}
; 28. VRINTA
; 29. VRINTM
; 30. VRINTN
; 31. VRINTP
; 32. VRINTR
; 33. VRINTX
; 34. VRINTZ
; 35. VSELEQ
; 36. VSELGE
; 37. VSELGT
; 38. VSELVS
; 39. VSQRT
; 40. VSUB
define float @Sub(float %a.coerce, float %b.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%add = fsub half %1, %3
%4 = bitcast half %add to i16
%tmp4.0.insert.ext = zext i16 %4 to i32
%5 = bitcast i32 %tmp4.0.insert.ext to float
ret float %5
; CHECK-LABEL: Sub:
; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_fsub
; CHECK-SOFT: bl __aeabi_f2h
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: vsub.f32
; CHECK-SOFTFP-VFP3: bl __aeabi_f2h
; CHECK-SOFTFP-FP16: vmov [[S2:s[0-9]]], r1
; CHECK-SOFTFP-FP16: vmov [[S0:s[0-9]]], r0
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vsub.f32 [[S0]], [[S0]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vmov r0, s0
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S0:s[0-9]]], r1
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0
; CHECK-SOFTFP-FULLFP16: vsub.f16 [[S0]], [[S2]], [[S0]]
; CHECK-SOFTFP-FULLFP16-NEXT: vmov.f16 r0, s0
; CHECK-HARDFP-VFP3: vmov r{{.}}, s0
; CHECK-HARDFP-VFP3: vmov{{.*}}, s1
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: vsub.f32
; CHECK-HARDFP-VFP3: bl __aeabi_f2h
; CHECK-HARDFP-VFP3: vmov s0, r0
; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], s1
; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S0:s[0-9]]], s0
; CHECK-HARDFP-FP16: vsub.f32 [[S0]], [[S0]], [[S2]]
; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
; CHECK-HARDFP-FULLFP16: vsub.f16 s0, s0, s1
}