[ARM] Armv8.2-A FP16 code generation (part 3/3)
This adds most of the FP16 codegen support, but these areas need further work:
- FP16 literals and immediates are not properly supported yet (e.g. literal pool needs work),
- Instructions that are generated from intrinsics (e.g. vabs) haven't been added.
This will be addressed in follow-up patches.

Differential Revision: https://reviews.llvm.org/D42849

llvm-svn: 324321
This commit is contained in:
parent 5442996944
commit 7941278b18
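Editorial note, not part of the commit: for orientation, the test diff at the bottom of this page shows the intended effect of the new patterns. With a FullFP16 target and the hard-float ABI, a scalar half operation is now selected directly as an .f16 VFP instruction instead of being widened to f32 or lowered through the __aeabi_h2f/__aeabi_f2h libcalls. A minimal sketch of that shape, reusing the float/i32 coercion idiom and the arm-none-eabihf RUN-line configuration from the test below (the function and value names here are made up):

; Sketch only -- mirrors the "; 2. VADD" case in the test diff below.
;   llc < example.ll -mtriple=arm-none-eabihf -mattr=+fullfp16
; With +fullfp16 the fadd is expected to select to: vadd.f16 s0, s0, s1
define float @sketch_add(float %a.coerce, float %b.coerce) {
entry:
  %a.bits  = bitcast float %a.coerce to i32     ; unpack the half from the float argument bits
  %a.trunc = trunc i32 %a.bits to i16
  %a       = bitcast i16 %a.trunc to half
  %b.bits  = bitcast float %b.coerce to i32
  %b.trunc = trunc i32 %b.bits to i16
  %b       = bitcast i16 %b.trunc to half
  %sum     = fadd half %a, %b                   ; the actual f16 operation
  %r.bits  = bitcast half %sum to i16           ; repack the result into the float return bits
  %r.ext   = zext i16 %r.bits to i32
  %r       = bitcast i32 %r.ext to float
  ret float %r
}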
@@ -1042,6 +1042,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");

setOperationAction(ISD::SETCC, MVT::i32, Expand);
setOperationAction(ISD::SETCC, MVT::f16, Expand);
setOperationAction(ISD::SETCC, MVT::f32, Expand);
setOperationAction(ISD::SETCC, MVT::f64, Expand);
setOperationAction(ISD::SELECT, MVT::i32, Custom);
@@ -12746,6 +12747,24 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}

bool ARMTargetLowering::isFNegFree(EVT VT) const {
if (!VT.isSimple())
return false;

// There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
// negate values directly (fneg is free). So, we don't want to let the DAG
// combiner rewrite fneg into xors and some other instructions. For f16 and
// FullFP16 argument passing, some bitcast nodes may be introduced,
// triggering this DAG combine rewrite, so we are avoiding that with this.
switch (VT.getSimpleVT().SimpleTy) {
default: break;
case MVT::f16:
return Subtarget->hasFullFP16();
}

return false;
}

bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
EVT VT = ExtVal.getValueType();

@@ -13842,6 +13861,8 @@ bool ARM::isBitFieldInvertedMask(unsigned v) {
bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
if (!Subtarget->hasVFP3())
return false;
if (VT == MVT::f16 && Subtarget->hasFullFP16())
return ARM_AM::getFP16Imm(Imm) != -1;
if (VT == MVT::f32)
return ARM_AM::getFP32Imm(Imm) != -1;
if (VT == MVT::f64 && !Subtarget->isFPOnlySP())
@@ -331,6 +331,7 @@ class VectorType;
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
bool isFNegFree(EVT VT) const override;

bool isVectorLoadExtDesirable(SDValue ExtVal) const override;

@@ -395,9 +395,9 @@ def VDIVS : ASbI<0b11101, 0b00, 0, 0,

let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VDIVH : AHbI<0b11101, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fdiv HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPDIV32]>;

let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -420,9 +420,9 @@ def VMULS : ASbIn<0b11100, 0b10, 0, 0,

let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VMULH : AHbI<0b11100, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fmul HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;

def VNMULD : ADbI<0b11100, 0b10, 1, 0,
@@ -442,9 +442,9 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
}

def VNMULH : AHbI<0b11100, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fneg (fmul HPR:$Sn, HPR:$Sm)))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;

multiclass vsel_inst<string op, bits<2> opc, int CC> {
@@ -525,9 +525,9 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
}

def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins SPR:$Sd, SPR:$Sm),
(outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
[]>;
[(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 1))]>;

def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
@@ -544,9 +544,9 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
}

def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins SPR:$Sd, SPR:$Sm),
(outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
[]>;
[(arm_cmpfp HPR:$Sd, HPR:$Sm, (i32 0))]>;
} // Defs = [FPSCR_NZCV]

//===----------------------------------------------------------------------===//
@@ -771,7 +771,7 @@ multiclass vcvt_inst<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs SPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"),
[]>,
Requires<[HasFullFP16]> {
@@ -779,7 +779,7 @@ multiclass vcvt_inst<string opc, bits<2> rm,
}

def UH : AHuInp<0b11101, 0b11, 0b1100, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs SPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".u32.f16\t$Sd, $Sm"),
[]>,
Requires<[HasFullFP16]> {
@@ -834,6 +834,17 @@ multiclass vcvt_inst<string opc, bits<2> rm,
}

let Predicates = [HasFPARMv8] in {
let Predicates = [HasFullFP16] in {
def : Pat<(i32 (fp_to_sint (node HPR:$a))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"SH") HPR:$a),
GPR)>;

def : Pat<(i32 (fp_to_uint (node HPR:$a))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"UH") HPR:$a),
GPR)>;
}
def : Pat<(i32 (fp_to_sint (node SPR:$a))),
(COPY_TO_REGCLASS
(!cast<Instruction>(NAME#"SS") SPR:$a),
@@ -875,9 +886,9 @@ def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
}

def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm",
[]>;
[(set HPR:$Sd, (fneg HPR:$Sm))]>;

multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0,
@@ -1313,13 +1324,16 @@ def : VFPNoNEONPat<(f32 (sint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VSITOS (VLDRS addrmode5:$a))>;

def VSITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs HPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.s32\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // s32
}

def : VFPNoNEONPat<(f16 (sint_to_fp GPR:$a)),
(VSITOH (COPY_TO_REGCLASS GPR:$a, SPR))>;

def VUITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
(outs DPR:$Dd), (ins SPR:$Sm),
IIC_fpCVTID, "vcvt", ".f64.u32\t$Dd, $Sm",
@@ -1355,13 +1369,16 @@ def : VFPNoNEONPat<(f32 (uint_to_fp (i32 (alignedload32 addrmode5:$a)))),
(VUITOS (VLDRS addrmode5:$a))>;

def VUITOH : AVConv1IHs_Encode<0b11101, 0b11, 0b1000, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs HPR:$Sd), (ins SPR:$Sm),
IIC_fpCVTIH, "vcvt", ".f16.u32\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 0; // u32
}

def : VFPNoNEONPat<(f16 (uint_to_fp GPR:$a)),
(VUITOH (COPY_TO_REGCLASS GPR:$a, SPR))>;

// FP -> Int:

class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -1456,13 +1473,16 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_sint (f32 SPR:$a))),
(VSTRS (VTOSIZS SPR:$a), addrmode5:$ptr)>;

def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs SPR:$Sd), (ins HPR:$Sm),
IIC_fpCVTHI, "vcvt", ".s32.f16\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}

def : VFPNoNEONPat<(i32 (fp_to_sint HPR:$a)),
(COPY_TO_REGCLASS (VTOSIZH HPR:$a), GPR)>;

def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
IIC_fpCVTDI, "vcvt", ".u32.f64\t$Sd, $Dm",
@@ -1499,13 +1519,16 @@ def : VFPNoNEONPat<(alignedstore32 (i32 (fp_to_uint (f32 SPR:$a))),
(VSTRS (VTOUIZS SPR:$a), addrmode5:$ptr)>;

def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
(outs SPR:$Sd), (ins SPR:$Sm),
(outs SPR:$Sd), (ins HPR:$Sm),
IIC_fpCVTHI, "vcvt", ".u32.f16\t$Sd, $Sm",
[]>,
Sched<[WriteFPCVT]> {
let Inst{7} = 1; // Z bit
}

def : VFPNoNEONPat<(i32 (fp_to_uint HPR:$a)),
(COPY_TO_REGCLASS (VTOUIZH HPR:$a), GPR)>;

// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
let Uses = [FPSCR] in {
def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
@@ -1789,9 +1812,10 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
}

def VMLAH : AHbI<0b11100, 0b00, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;

@@ -1801,6 +1825,10 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
(VMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx,DontUseFusedMAC]>;

def VMLSD : ADbI<0b11100, 0b00, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1825,9 +1853,10 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
}

def VMLSH : AHbI<0b11100, 0b00, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;

@@ -1837,6 +1866,9 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
(VMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;

def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1861,9 +1893,10 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
}

def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;

@@ -1874,6 +1907,9 @@ def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin),
(VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;

// (-dst - (a * b)) -> -(dst + (a * b))
def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
@@ -1882,6 +1918,9 @@ def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)),
(VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;

def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
(outs DPR:$Dd), (ins DPR:$Ddin, DPR:$Dn, DPR:$Dm),
@@ -1905,9 +1944,9 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
}

def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx,DontUseFusedMAC]>;

@@ -1917,6 +1956,9 @@ def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
(VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;
def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin),
(VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx,DontUseFusedMAC]>;

//===----------------------------------------------------------------------===//
// Fused FP Multiply-Accumulate Operations.
@@ -1943,9 +1985,10 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
}

def VFMAH : AHbI<0b11101, 0b10, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -1956,6 +1999,9 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
(VFMAH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>;

// Match @llvm.fma.* intrinsics
// (fma x, y, z) -> (vfms z, x, y)
@@ -1988,9 +2034,10 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
}

def VFMSH : AHbI<0b11101, 0b10, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2001,6 +2048,9 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
(VFMSH HPR:$dstin, HPR:$a, HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>;

// Match @llvm.fma.* intrinsics
// (fma (fneg x), y, z) -> (vfms z, x, y)
@@ -2040,9 +2090,10 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
}

def VFNMAH : AHbI<0b11101, 0b01, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2091,9 +2142,9 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
}

def VFNMSH : AHbI<0b11101, 0b01, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sdin, SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -1,29 +1,44 @@

; SOFT:
; RUN: llc < %s -mtriple=arm-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
; RUN: llc < %s -mtriple=thumb-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT

; SOFTFP:
; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp3 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-VFP3
; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-FP16
; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-FULLFP16

; RUN: llc < %s -mtriple=thumbv7-none-eabi -mattr=+vfp3 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-VFP3
; RUN: llc < %s -mtriple=thumbv7-none-eabi -mattr=+vfp4 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-FP16
; RUN: llc < %s -mtriple=thumbv7-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-FULLFP16

; HARD:
; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp3 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-VFP3
; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp4 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FP16
; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FULLFP16

; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mattr=+vfp3 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-VFP3
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mattr=+vfp4 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FP16
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FULLFP16

define float @RetValBug(float %A.coerce) local_unnamed_addr {
; FP-CONTRACT=FAST
; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+fullfp16 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FULLFP16-FAST
; RUN: llc < %s -mtriple=thumbv7-none-eabihf -mattr=+fullfp16 -fp-contract=fast | FileCheck %s --check-prefixes=CHECK,CHECK-HARDFP-FULLFP16-FAST

define float @RetValBug(float %A.coerce) {
entry:
ret float undef
; This expression is optimised away due to the undef value. Check that
; LowerReturn can handle undef nodes (i.e. nodes which do not have any
; operands) when FullFP16 is enabled.
; Check that LowerReturn can handle undef nodes (i.e. nodes which do not have
; any operands) when FullFP16 is enabled.
;
; CHECK-LABEL: RetValBug:
; CHECK-HARDFP-FULLFP16: mov pc, lr
; CHECK-HARDFP-FULLFP16: {{.*}} lr
}

define float @Add(float %a.coerce, float %b.coerce) local_unnamed_addr {
; 1. VABS: TODO

; 2. VADD
define float @Add(float %a.coerce, float %b.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
@@ -61,7 +76,6 @@ entry:
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0
; CHECK-SOFTFP-FULLFP16: vadd.f16 [[S0]], [[S2]], [[S0]]
; CHECK-SOFTFP-FULLFP16-NEXT: vmov.f16 r0, s0
; CHECK-SOFTFP-FULLFP16-NEXT: mov pc, lr

; CHECK-HARDFP-VFP3: vmov r{{.}}, s0
; CHECK-HARDFP-VFP3: vmov{{.*}}, s1
@@ -77,5 +91,549 @@ entry:
; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]

; CHECK-HARDFP-FULLFP16: vadd.f16 s0, s0, s1
; CHECK-HARDFP-FULLFP16-NEXT: mov pc, lr
}

; 3. VCMP
define zeroext i1 @VCMP(float %F.coerce, float %G.coerce) {
entry:
%0 = bitcast float %F.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %G.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%cmp = fcmp ogt half %1, %3
ret i1 %cmp

; CHECK-LABEL: VCMP:

; CHECK-SOFT: bl __aeabi_fcmpgt

; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: vcmpe.f32 s{{.}}, s{{.}}

; CHECK-SOFTFP-FP16: vcvtb.f32.f16 s{{.}}, s{{.}}
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 s{{.}}, s{{.}}
; CHECK-SOFTFP-FP16: vcmpe.f32 s{{.}}, s{{.}}

; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S0:s[0-9]]], r1
; CHECK-SOFTFP-FULLFP16: vcmpe.f16 [[S2]], [[S0]]

; CHECK-SOFTFP-FULLFP16-NOT: vmov.f16 s{{.}}, r0
; CHECK-SOFTFP-FULLFP16-NOT: vmov.f16 s{{.}}, r1
; CHECK-HARDFP-FULLFP16: vcmpe.f16 s0, s1
}

; 4. VCMPE

; FIXME: enable when constant pool is fixed
;
;define i32 @VCMPE_IMM(float %F.coerce) {
;entry:
;  %0 = bitcast float %F.coerce to i32
;  %tmp.0.extract.trunc = trunc i32 %0 to i16
;  %1 = bitcast i16 %tmp.0.extract.trunc to half
;  %tmp = fcmp olt half %1, 1.000000e+00
;  %tmp1 = zext i1 %tmp to i32
;  ret i32 %tmp1
;}

define i32 @VCMPE(float %F.coerce, float %G.coerce) {
entry:
%0 = bitcast float %F.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %G.coerce to i32
%tmp.1.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp.1.extract.trunc to half
%tmp = fcmp olt half %1, %3
%tmp1 = zext i1 %tmp to i32
ret i32 %tmp1

; CHECK-LABEL: VCMPE:
}

; 5. VCVT (between floating-point and fixed-point)
; Only assembly/disassembly support

; 6. VCVT (between floating-point and integer, both directions)
define i32 @fptosi(i32 %A.coerce) {
entry:
%tmp.0.extract.trunc = trunc i32 %A.coerce to i16
%0 = bitcast i16 %tmp.0.extract.trunc to half
%conv = fptosi half %0 to i32
ret i32 %conv

; CHECK-LABEL: fptosi:

; CHECK-HARDFP-FULLFP16: vmov.f16 s0, r0
; CHECK-HARDFP-FULLFP16-NEXT: vcvt.s32.f16 s0, s0
; CHECK-HARDFP-FULLFP16-NEXT: vmov r0, s0
}

define i32 @fptoui(i32 %A.coerce) {
entry:
%tmp.0.extract.trunc = trunc i32 %A.coerce to i16
%0 = bitcast i16 %tmp.0.extract.trunc to half
%conv = fptoui half %0 to i32
ret i32 %conv

; CHECK-HARDFP-FULLFP16: vcvt.u32.f16 s0, s0
; CHECK-HARDFP-FULLFP16-NEXT: vmov r0, s0
}

define float @UintToH(i32 %a, i32 %b) {
entry:
%0 = uitofp i32 %a to half
%1 = bitcast half %0 to i16
%tmp0.insert.ext = zext i16 %1 to i32
%2 = bitcast i32 %tmp0.insert.ext to float
ret float %2

; CHECK-LABEL: UintToH:

; CHECK-HARDFP-FULLFP16: vmov s0, r0
; CHECK-HARDFP-FULLFP16-NEXT: vcvt.f16.u32 s0, s0
}

define float @SintToH(i32 %a, i32 %b) {
entry:
%0 = sitofp i32 %a to half
%1 = bitcast half %0 to i16
%tmp0.insert.ext = zext i16 %1 to i32
%2 = bitcast i32 %tmp0.insert.ext to float
ret float %2

; CHECK-LABEL: SintToH:

; CHECK-HARDFP-FULLFP16: vmov s0, r0
; CHECK-HARDFP-FULLFP16-NEXT: vcvt.f16.s32 s0, s0
}

; TODO:
; 7. VCVTA
; 8. VCVTM
; 9. VCVTN
; 10. VCVTP
; 11. VCVTR

; 12. VDIV
define float @Div(float %a.coerce, float %b.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%add = fdiv half %1, %3
%4 = bitcast half %add to i16
%tmp4.0.insert.ext = zext i16 %4 to i32
%5 = bitcast i32 %tmp4.0.insert.ext to float
ret float %5

; CHECK-LABEL: Div:

; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_fdiv
; CHECK-SOFT: bl __aeabi_f2h

; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: vdiv.f32
; CHECK-SOFTFP-VFP3: bl __aeabi_f2h

; CHECK-SOFTFP-FP16: vmov [[S2:s[0-9]]], r1
; CHECK-SOFTFP-FP16: vmov [[S0:s[0-9]]], r0
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vdiv.f32 [[S0]], [[S0]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vmov r0, s0

; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S0:s[0-9]]], r1
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0
; CHECK-SOFTFP-FULLFP16: vdiv.f16 [[S0]], [[S2]], [[S0]]
; CHECK-SOFTFP-FULLFP16-NEXT: vmov.f16 r0, s0

; CHECK-HARDFP-VFP3: vmov r{{.}}, s0
; CHECK-HARDFP-VFP3: vmov{{.*}}, s1
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: vdiv.f32
; CHECK-HARDFP-VFP3: bl __aeabi_f2h
; CHECK-HARDFP-VFP3: vmov s0, r0

; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], s1
; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S0:s[0-9]]], s0
; CHECK-HARDFP-FP16: vdiv.f32 [[S0]], [[S0]], [[S2]]
; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]

; CHECK-HARDFP-FULLFP16: vdiv.f16 s0, s0, s1
}

; 13. VFMA
define float @VFMA(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%mul = fmul half %1, %3
%add = fadd half %mul, %5
%6 = bitcast half %add to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7

; CHECK-LABEL: VFMA:
; CHECK-HARDFP-FULLFP16-FAST: vfma.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16-FAST-NEXT: vmov.f32 s0, s2
}

; 14. VFMS
define float @VFMS(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%mul = fmul half %1, %3
%sub = fsub half %5, %mul
%6 = bitcast half %sub to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7

; CHECK-LABEL: VFMS:
; CHECK-HARDFP-FULLFP16-FAST: vfms.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16-FAST-NEXT: vmov.f32 s0, s2
}

; 15. VFNMA
define float @VFNMA(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%mul = fmul half %1, %3
%sub = fsub half -0.0, %mul
%sub2 = fsub half %sub, %5
%6 = bitcast half %sub2 to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7

; CHECK-LABEL: VFNMA:
; CHECK-HARDFP-FULLFP16-FAST: vfnma.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16-FAST-NEXT: vmov.f32 s0, s2
}

; 16. VFNMS
define float @VFNMS(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%mul = fmul half %1, %3
%sub2 = fsub half %mul, %5
%6 = bitcast half %sub2 to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7

; CHECK-LABEL: VFNMS:
; CHECK-HARDFP-FULLFP16-FAST: vfnms.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16-FAST-NEXT: vmov.f32 s0, s2
}

; TODO:
; 17. VMAXNM
; 18. VMINNM

; 19. VMLA
define float @VMLA(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%mul = fmul half %1, %3
%add = fadd half %5, %mul
%6 = bitcast half %add to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7

; CHECK-LABEL: VMLA:
; CHECK-HARDFP-FULLFP16: vmla.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16-NEXT: vmov.f32 s0, s2
}

; 20. VMLS
define float @VMLS(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%mul = fmul half %1, %3
%add = fsub half %5, %mul
%6 = bitcast half %add to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7

; CHECK-LABEL: VMLS:
; CHECK-HARDFP-FULLFP16: vmls.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16-NEXT: vmov.f32 s0, s2
}

; TODO: fix immediates.
; 21. VMOV (between general-purpose register and half-precision register)
; 22. VMOV (immediate)

; 23. VMUL
define float @Mul(float %a.coerce, float %b.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%add = fmul half %1, %3
%4 = bitcast half %add to i16
%tmp4.0.insert.ext = zext i16 %4 to i32
%5 = bitcast i32 %tmp4.0.insert.ext to float
ret float %5

; CHECK-LABEL: Mul:

; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_fmul
; CHECK-SOFT: bl __aeabi_f2h

; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: vmul.f32
; CHECK-SOFTFP-VFP3: bl __aeabi_f2h

; CHECK-SOFTFP-FP16: vmov [[S2:s[0-9]]], r1
; CHECK-SOFTFP-FP16: vmov [[S0:s[0-9]]], r0
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vmul.f32 [[S0]], [[S0]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vmov r0, s0

; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S0:s[0-9]]], r1
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0
; CHECK-SOFTFP-FULLFP16: vmul.f16 [[S0]], [[S2]], [[S0]]
; CHECK-SOFTFP-FULLFP16-NEXT: vmov.f16 r0, s0

; CHECK-HARDFP-VFP3: vmov r{{.}}, s0
; CHECK-HARDFP-VFP3: vmov{{.*}}, s1
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: vmul.f32
; CHECK-HARDFP-VFP3: bl __aeabi_f2h
; CHECK-HARDFP-VFP3: vmov s0, r0

; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], s1
; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S0:s[0-9]]], s0
; CHECK-HARDFP-FP16: vmul.f32 [[S0]], [[S0]], [[S2]]
; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]

; CHECK-HARDFP-FULLFP16: vmul.f16 s0, s0, s1
}

; 24. VNEG
define float @Neg(float %a.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = fsub half -0.000000e+00, %1
%3 = bitcast half %2 to i16
%tmp4.0.insert.ext = zext i16 %3 to i32
%4 = bitcast i32 %tmp4.0.insert.ext to float
ret float %4

; CHECK-LABEL: Neg:
; CHECK-HARDFP-FULLFP16: vneg.f16 s0, s0
}

; 25. VNMLA
define float @VNMLA(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%add = fmul half %1, %3
%add2 = fsub half -0.000000e+00, %add
%add3 = fsub half %add2, %5
%6 = bitcast half %add3 to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7

; CHECK-LABEL: VNMLA:
; CHECK-HARDFP-FULLFP16: vnmla.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16: vmov.f32 s0, s2
}

; 26. VNMLS
define float @VNMLS(float %a.coerce, float %b.coerce, float %c.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%4 = bitcast float %c.coerce to i32
%tmp2.0.extract.trunc = trunc i32 %4 to i16
%5 = bitcast i16 %tmp2.0.extract.trunc to half
%add = fmul half %1, %3
%add2 = fsub half %add, %5
%6 = bitcast half %add2 to i16
%tmp4.0.insert.ext = zext i16 %6 to i32
%7 = bitcast i32 %tmp4.0.insert.ext to float
ret float %7

; CHECK-LABEL: VNMLS:
; CHECK-HARDFP-FULLFP16: vnmls.f16 s2, s0, s1
; CHECK-HARDFP-FULLFP16: vmov.f32 s0, s2
}

; 27. VNMUL
define float @NMul(float %a.coerce, float %b.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%add = fmul half %1, %3
%add2 = fsub half -0.0, %add
%4 = bitcast half %add2 to i16
%tmp4.0.insert.ext = zext i16 %4 to i32
%5 = bitcast i32 %tmp4.0.insert.ext to float
ret float %5

; CHECK-LABEL: NMul:
; CHECK-HARDFP-FULLFP16: vnmul.f16 s0, s0, s1
}

; 28. VRINTA
; 29. VRINTM
; 30. VRINTN
; 31. VRINTP
; 32. VRINTR
; 33. VRINTX
; 34. VRINTZ
; 35. VSELEQ
; 36. VSELGE
; 37. VSELGT
; 38. VSELVS
; 39. VSQRT

; 40. VSUB
define float @Sub(float %a.coerce, float %b.coerce) {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%add = fsub half %1, %3
%4 = bitcast half %add to i16
%tmp4.0.insert.ext = zext i16 %4 to i32
%5 = bitcast i32 %tmp4.0.insert.ext to float
ret float %5

; CHECK-LABEL: Sub:

; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_fsub
; CHECK-SOFT: bl __aeabi_f2h

; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: vsub.f32
; CHECK-SOFTFP-VFP3: bl __aeabi_f2h

; CHECK-SOFTFP-FP16: vmov [[S2:s[0-9]]], r1
; CHECK-SOFTFP-FP16: vmov [[S0:s[0-9]]], r0
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vsub.f32 [[S0]], [[S0]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vmov r0, s0

; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S0:s[0-9]]], r1
; CHECK-SOFTFP-FULLFP16: vmov.f16 [[S2:s[0-9]]], r0
; CHECK-SOFTFP-FULLFP16: vsub.f16 [[S0]], [[S2]], [[S0]]
; CHECK-SOFTFP-FULLFP16-NEXT: vmov.f16 r0, s0

; CHECK-HARDFP-VFP3: vmov r{{.}}, s0
; CHECK-HARDFP-VFP3: vmov{{.*}}, s1
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: vsub.f32
; CHECK-HARDFP-VFP3: bl __aeabi_f2h
; CHECK-HARDFP-VFP3: vmov s0, r0

; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], s1
; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S0:s[0-9]]], s0
; CHECK-HARDFP-FP16: vsub.f32 [[S0]], [[S0]], [[S2]]
; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]

; CHECK-HARDFP-FULLFP16: vsub.f16 s0, s0, s1
}