
[ARM] Armv8.2-A FP16 code generation (part 1/3)

This is the groundwork for Armv8.2-A FP16 code generation.

Clang passes and returns _Float16 values as floats, together with the required
bitconverts, truncs, etc., to implement correct AAPCS behaviour; see D42318.
We will implement half-precision argument passing/returning lowering in the ARM
backend soon, but for now this means that this:

_Float16 add(_Float16 a, _Float16 b) {
  return a + b;
}

gets lowered to this:

define float @add(float %a.coerce, float %b.coerce) {
entry:
  %0 = bitcast float %a.coerce to i32
  %tmp.0.extract.trunc = trunc i32 %0 to i16
  %1 = bitcast i16 %tmp.0.extract.trunc to half
  <SNIP>
  %add = fadd half %1, %3
  <SNIP>
}

When FullFP16 is *not* supported, we don't make f16 a legal type, and we get
legalization for "free": nothing changes, everything works as before, and f16
argument passing/returning is handled too.
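
To make "free" concrete: without FullFP16, each half value is promoted to
single precision through the AAPCS runtime helpers, the arithmetic is done in
f32, and the result is truncated back. A minimal C++ sketch of the equivalent
computation, assuming the compiler-rt signatures for the helpers (these are
the calls the CHECK-SOFT lines verify in the new test at the end of this
patch):

#include <cstdint>

// AAPCS half <-> float conversion helpers provided by compiler-rt.
extern "C" float __aeabi_h2f(uint16_t);
extern "C" uint16_t __aeabi_f2h(float);

// What 'add' effectively computes once f16 is legalized away: promote both
// operands to f32, add, and truncate the result back to f16.
uint16_t add_promoted(uint16_t a, uint16_t b) {
  return __aeabi_f2h(__aeabi_h2f(a) + __aeabi_h2f(b));
}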

When FullFP16 is supported, we do make f16 a legal type, and there are two
places we need to patch up: f16 argument passing and f16 returning. This
involves minor tweaks to avoid generating code for some unnecessary bitcasts.
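
The returning side boils down to recognising the coercion pattern and handing
the f16-producing node straight to the return. Here is a condensed sketch of
the check added to ARMTargetLowering::LowerReturn below; the helper name is
made up for illustration, the real code does this inline:

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Peel f32 = bitcast (i32 = zero_extend (i16 = bitcast f16)) down to the
// node that produces the f16 value, so no code is generated for the
// coercion bitcasts. (The operand-count guard is added for this sketch.)
static SDValue peelF16ReturnCoercion(SDValue Arg) {
  if (Arg.getValueType() != MVT::f32 || Arg.getNumOperands() == 0)
    return SDValue();
  SDValue ZE = Arg.getOperand(0);
  if (ZE.getOpcode() != ISD::ZERO_EXTEND || ZE.getValueType() != MVT::i32)
    return SDValue();
  SDValue BC = ZE.getOperand(0);
  if (BC.getOpcode() != ISD::BITCAST || BC.getValueType() != MVT::i16)
    return SDValue();
  return BC.getOperand(0);
}

With this and the matching argument-side rewrite in ExpandBITCAST, the add
example above compiles down to a single vadd.f16 plus return in the
hard-float FullFP16 configuration (see CHECK-HARDFP-FULLFP16 in the new
test).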

As a "demonstrator" that this works for the different FP16, FullFP16, softfp
modes, etc., I've added match rules to the VSUB instruction description showing
that we can codegen this instruction from IR, but more importantly, also to
some conversion instructions. These conversions were causing issue before in
the FP16 and FullFP16 cases.
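
As a quick, hypothetical way to exercise those conversion patterns (not code
from this patch): __fp16 is ARM's storage-only half-precision type in C and
C++, and loading and widening one should now select the new patterns instead
of tripping up instruction selection:

// With -mattr=+fullfp16 this should select VLDRH for the load and VCVTBHS
// for the widening, i.e. vldr.16 followed by vcvtb.f32.f16.
float widen(const __fp16 *p) {
  return *p;
}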

I've also added match rules to the VLDRH and VSTRH descriptions, so that we
can actually compile the entire half-precision add code example above. This
showed that these loads and stores had the wrong addressing mode specified:
AddrMode5 instead of AddrMode5FP16, which turned out not to be implemented at
all, so that has also been added.
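
For reference, a small sketch of the offset rule that AddrMode5FP16
implements, using the NumBits = 8 / Scale = 2 arithmetic from the
rewriteARMFrameIndex hunk below (AddrMode5 proper scales its 8-bit immediate
by 4):

// Can this byte offset be folded into a vldr.16/vstr.16 address? The
// encoding is an 8-bit immediate scaled by 2, plus an add/sub bit.
bool fitsAddrMode5FP16(int ByteOffset, bool &IsSub, unsigned &Imm8) {
  if (ByteOffset % 2 != 0)            // Scale = 2: halfword-aligned only
    return false;
  IsSub = ByteOffset < 0;
  unsigned Magnitude = IsSub ? -(unsigned)ByteOffset : (unsigned)ByteOffset;
  Imm8 = Magnitude / 2;
  return Imm8 < 256;                  // NumBits = 8
}

So, for example, an offset of 510 is encodable, while 511 (odd) and 512 (out
of range) are not.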

This is the minimal patch that shows all the different moving parts. In patch
2/3 I will add some efficient lowering of bitcasts, and in patch 3/3 I will
add the remaining Armv8.2-A FP16 instruction descriptions.


Thanks to Sam Parker and Oliver Stannard for their help and reviews!


Differential Revision: https://reviews.llvm.org/D38315

llvm-svn: 323512
Author: Sjoerd Meijer
Date:   2018-01-26 09:26:40 +00:00
Parent: 7f54536b89
Commit: 77cd5c40d2

11 changed files with 239 additions and 29 deletions

File: lib/Target/ARM/ARMBaseInstrInfo.cpp

@@ -2409,6 +2409,14 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
NumBits = 8;
Scale = 4;
break;
case ARMII::AddrMode5FP16:
ImmIdx = FrameRegIdx+1;
InstrOffs = ARM_AM::getAM5Offset(MI.getOperand(ImmIdx).getImm());
if (ARM_AM::getAM5Op(MI.getOperand(ImmIdx).getImm()) == ARM_AM::sub)
InstrOffs *= -1;
NumBits = 8;
Scale = 2;
break;
default:
llvm_unreachable("Unsupported addressing mode!");
}

File: lib/Target/ARM/ARMCallingConv.td

@@ -187,6 +187,7 @@ def RetCC_ARM_AAPCS : CallingConv<[
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
@@ -233,7 +234,7 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,
S9, S10, S11, S12, S13, S14, S15]>>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;

File: lib/Target/ARM/ARMISelDAGToDAG.cpp

@@ -118,8 +118,10 @@ public:
SDValue &Offset, SDValue &Opc);
bool SelectAddrMode3Offset(SDNode *Op, SDValue N,
SDValue &Offset, SDValue &Opc);
bool SelectAddrMode5(SDValue N, SDValue &Base,
SDValue &Offset);
bool IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
int Lwb, int Upb, bool FP16);
bool SelectAddrMode5(SDValue N, SDValue &Base, SDValue &Offset);
bool SelectAddrMode5FP16(SDValue N, SDValue &Base, SDValue &Offset);
bool SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,SDValue &Align);
bool SelectAddrMode6Offset(SDNode *Op, SDValue N, SDValue &Offset);
@@ -886,8 +888,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3Offset(SDNode *Op, SDValue N,
return true;
}
bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
SDValue &Base, SDValue &Offset) {
bool ARMDAGToDAGISel::IsAddressingMode5(SDValue N, SDValue &Base, SDValue &Offset,
int Lwb, int Upb, bool FP16) {
if (!CurDAG->isBaseWithConstantOffset(N)) {
Base = N;
if (N.getOpcode() == ISD::FrameIndex) {
@@ -907,8 +909,9 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
// If the RHS is +/- imm8, fold into addr mode.
int RHSC;
if (isScaledConstantInRange(N.getOperand(1), /*Scale=*/4,
-256 + 1, 256, RHSC)) {
const int Scale = FP16 ? 2 : 4;
if (isScaledConstantInRange(N.getOperand(1), Scale, Lwb, Upb, RHSC)) {
Base = N.getOperand(0);
if (Base.getOpcode() == ISD::FrameIndex) {
int FI = cast<FrameIndexSDNode>(Base)->getIndex();
@@ -921,17 +924,43 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
AddSub = ARM_AM::sub;
RHSC = -RHSC;
}
Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
SDLoc(N), MVT::i32);
if (FP16)
Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(AddSub, RHSC),
SDLoc(N), MVT::i32);
else
Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(AddSub, RHSC),
SDLoc(N), MVT::i32);
return true;
}
Base = N;
Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
SDLoc(N), MVT::i32);
if (FP16)
Offset = CurDAG->getTargetConstant(ARM_AM::getAM5FP16Opc(ARM_AM::add, 0),
SDLoc(N), MVT::i32);
else
Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0),
SDLoc(N), MVT::i32);
return true;
}
bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
SDValue &Base, SDValue &Offset) {
int Lwb = -256 + 1;
int Upb = 256;
return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ false);
}
bool ARMDAGToDAGISel::SelectAddrMode5FP16(SDValue N,
SDValue &Base, SDValue &Offset) {
int Lwb = -512 + 1;
int Upb = 512;
return IsAddressingMode5(N, Base, Offset, Lwb, Upb, /*FP16=*/ true);
}
bool ARMDAGToDAGISel::SelectAddrMode6(SDNode *Parent, SDValue N, SDValue &Addr,
SDValue &Align) {
Addr = N;

File: lib/Target/ARM/ARMISelLowering.cpp

@@ -522,6 +522,13 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::f64, &ARM::DPRRegClass);
}
if (Subtarget->hasFullFP16()) {
addRegisterClass(MVT::f16, &ARM::HPRRegClass);
// Clean up bitcast of incoming arguments if hard float abi is enabled.
if (Subtarget->isTargetHardFloat())
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
}
for (MVT VT : MVT::vector_valuetypes()) {
for (MVT InnerVT : MVT::vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
@@ -2474,12 +2481,37 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(VA.isRegLoc() && "Can only return in registers!");
SDValue Arg = OutVals[realRVLocIdx];
bool ReturnF16 = false;
if (Subtarget->hasFullFP16() && Subtarget->isTargetHardFloat()) {
// Half-precision return values can be returned like this:
//
// t11 f16 = fadd ...
// t12: i16 = bitcast t11
// t13: i32 = zero_extend t12
// t14: f32 = bitcast t13
//
// to avoid code generation for bitcasts, we simply set Arg to the node
// that produces the f16 value, t11 in this case.
//
if (Arg.getValueType() == MVT::f32) {
SDValue ZE = Arg.getOperand(0);
if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
SDValue BC = ZE.getOperand(0);
if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
Arg = BC.getOperand(0);
ReturnF16 = true;
}
}
}
}
switch (VA.getLocInfo()) {
default: llvm_unreachable("Unknown loc info!");
case CCValAssign::Full: break;
case CCValAssign::BCvt:
Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
if (!ReturnF16)
Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
break;
}
@@ -2527,7 +2559,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Guarantee that all emitted copies are
// stuck together, avoiding something bad.
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
RetOps.push_back(DAG.getRegister(VA.getLocReg(),
ReturnF16 ? MVT::f16 : VA.getLocVT()));
}
const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
@@ -3684,7 +3717,10 @@ SDValue ARMTargetLowering::LowerFormalArguments(
} else {
const TargetRegisterClass *RC;
if (RegVT == MVT::f32)
if (RegVT == MVT::f16)
RC = &ARM::HPRRegClass;
else if (RegVT == MVT::f32)
RC = &ARM::SPRRegClass;
else if (RegVT == MVT::f64)
RC = &ARM::DPRRegClass;
@@ -5024,6 +5060,37 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
// source or destination of the bit convert.
EVT SrcVT = Op.getValueType();
EVT DstVT = N->getValueType(0);
// Half-precision arguments can be passed in like this:
//
// t4: f32,ch = CopyFromReg t0, Register:f32 %1
// t8: i32 = bitcast t4
// t9: i16 = truncate t8
// t10: f16 = bitcast t9 <~~~~ SDNode N
//
// but we want to avoid code generation for the bitcast, so transform this
// into:
//
// t18: f16 = CopyFromReg t0, Register:f32 %0
//
if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
if (Op.getOpcode() != ISD::TRUNCATE)
return SDValue();
SDValue Bitcast = Op.getOperand(0);
if (Bitcast.getOpcode() != ISD::BITCAST ||
Bitcast.getValueType() != MVT::i32)
return SDValue();
SDValue Copy = Bitcast.getOperand(0);
if (Copy.getOpcode() != ISD::CopyFromReg ||
Copy.getValueType() != MVT::f32)
return SDValue();
SDValue Ops[] = { Copy->getOperand(0), Copy->getOperand(1) };
return DAG.getNode(ISD::CopyFromReg, SDLoc(Copy), MVT::f16, Ops);
}
assert((SrcVT == MVT::i64 || DstVT == MVT::i64) &&
"ExpandBITCAST called for non-i64 type");

File: lib/Target/ARM/ARMInstrFormats.td

@@ -108,6 +108,7 @@ def AddrModeT2_so : AddrMode<13>;
def AddrModeT2_pc : AddrMode<14>;
def AddrModeT2_i8s4 : AddrMode<15>;
def AddrMode_i12 : AddrMode<16>;
def AddrMode5FP16 : AddrMode<17>;
// Load / store index mode.
class IndexMode<bits<2> val> {
@@ -1527,7 +1528,7 @@ class ASI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
class AHI5<bits<4> opcod1, bits<2> opcod2, dag oops, dag iops,
InstrItinClass itin,
string opc, string asm, list<dag> pattern>
: VFPI<oops, iops, AddrMode5, 4, IndexModeNone,
: VFPI<oops, iops, AddrMode5FP16, 4, IndexModeNone,
VFPLdStFrm, itin, opc, asm, "", pattern> {
list<Predicate> Predicates = [HasFullFP16];

File: lib/Target/ARM/ARMInstrVFP.td

@@ -69,10 +69,19 @@ def vfp_f64imm : Operand<f64>,
let ParserMatchClass = FPImmOperand;
}
def alignedload16 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() >= 2;
}]>;
def alignedload32 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() >= 4;
}]>;
def alignedstore16 : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return cast<StoreSDNode>(N)->getAlignment() >= 2;
}]>;
def alignedstore32 : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
return cast<StoreSDNode>(N)->getAlignment() >= 4;
@@ -113,9 +122,9 @@ def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
let D = VFPNeonDomain;
}
def VLDRH : AHI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5fp16:$addr),
def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr),
IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
[]>,
[(set HPR:$Sd, (alignedload16 addrmode5fp16:$addr))]>,
Requires<[HasFullFP16]>;
} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
@@ -132,9 +141,9 @@ def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
let D = VFPNeonDomain;
}
def VSTRH : AHI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5fp16:$addr),
def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr),
IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
[]>,
[(alignedstore16 HPR:$Sd, addrmode5fp16:$addr)]>,
Requires<[HasFullFP16]>;
//===----------------------------------------------------------------------===//
@@ -335,9 +344,9 @@ def VADDS : ASbIn<0b11100, 0b11, 0, 0,
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VADDH : AHbI<0b11100, 0b11, 0, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -360,9 +369,9 @@ def VSUBS : ASbIn<0b11100, 0b11, 1, 0,
let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VSUBH : AHbI<0b11100, 0b11, 1, 0,
(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
[]>,
[(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>,
Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -658,17 +667,19 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
let Predicates = [HasVFP2, HasDPVFP];
}
// Between half, single and double-precision. For disassembly only.
// Between half, single and double-precision.
def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>,
[ /* intentionally left blank, see rule below */ ]>,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
def : Pat<(f32 (fpextend HPR:$Sm)),
(VCVTBHS (COPY_TO_REGCLASS HPR:$Sm, SPR))>;
def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
[/* For disassembly only; pattern left blank */]>,
[]>,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;

File: lib/Target/ARM/ARMRegisterInfo.td

@@ -307,6 +307,18 @@ def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
let DiagnosticString = "operand must be a register in range [s0, s31]";
}
def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> {
let AltOrders = [(add (decimate HPR, 2), SPR),
(add (decimate HPR, 4),
(decimate HPR, 2),
(decimate (rotl HPR, 1), 4),
(decimate (rotl HPR, 1), 2))];
let AltOrderSelect = [{
return 1 + MF.getSubtarget<ARMSubtarget>().useStride4VFPs(MF);
}];
let DiagnosticString = "operand must be a register in range [s0, s31]";
}
// Subset of SPR which can be used as a source of NEON scalars for 16-bit
// operations
def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)> {

File: lib/Target/ARM/Disassembler/ARMDisassembler.cpp

@@ -158,6 +158,8 @@ static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo,
@@ -996,6 +998,11 @@ static DecodeStatus DecodeSPRRegisterClass(MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
static DecodeStatus DecodeHPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
return DecodeSPRRegisterClass(Inst, RegNo, Address, Decoder);
}
static const uint16_t DPRDecoderTable[] = {
ARM::D0, ARM::D1, ARM::D2, ARM::D3,
ARM::D4, ARM::D5, ARM::D6, ARM::D7,

File: lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h

@@ -186,7 +186,8 @@ namespace ARMII {
AddrModeT2_so = 13,
AddrModeT2_pc = 14, // +/- i12 for pc relative data
AddrModeT2_i8s4 = 15, // i8 * 4
AddrMode_i12 = 16
AddrMode_i12 = 16,
AddrMode5FP16 = 17 // i8 * 2
};
inline static const char *AddrModeToString(AddrMode addrmode) {
@@ -197,6 +198,7 @@ inline static const char *AddrModeToString(AddrMode addrmode) {
case AddrMode3: return "AddrMode3";
case AddrMode4: return "AddrMode4";
case AddrMode5: return "AddrMode5";
case AddrMode5FP16: return "AddrMode5FP16";
case AddrMode6: return "AddrMode6";
case AddrModeT1_1: return "AddrModeT1_1";
case AddrModeT1_2: return "AddrModeT1_2";

File: test/CodeGen/ARM/GlobalISel/arm-unsupported.ll

@@ -43,7 +43,7 @@ define i17 @test_funny_ints(i17 %a, i17 %b) {
}
define half @test_half(half %a, half %b) {
; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)*
; CHECK: remark: {{.*}} unable to lower arguments: half (half, half)* (in function: test_half)
; CHECK-LABEL: warning: Instruction selection used fallback path for test_half
%res = fadd half %a, %b
ret half %res

File: test/CodeGen/ARM/fp16-instructions.ll (new file)

@@ -0,0 +1,72 @@
; SOFT:
; RUN: llc < %s -mtriple=arm-none-eabi -float-abi=soft | FileCheck %s --check-prefix=CHECK-SOFT
; SOFTFP:
; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp3 | FileCheck %s --check-prefix=CHECK-SOFTFP-VFP3
; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+vfp4 | FileCheck %s --check-prefix=CHECK-SOFTFP-FP16
; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+fullfp16 | FileCheck %s --check-prefix=CHECK-SOFTFP-FULLFP16
; HARD:
; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp3 | FileCheck %s --check-prefix=CHECK-HARDFP-VFP3
; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+vfp4 | FileCheck %s --check-prefix=CHECK-HARDFP-FP16
; RUN: llc < %s -mtriple=arm-none-eabihf -mattr=+fullfp16 | FileCheck %s --check-prefix=CHECK-HARDFP-FULLFP16
define float @Add(float %a.coerce, float %b.coerce) local_unnamed_addr {
entry:
%0 = bitcast float %a.coerce to i32
%tmp.0.extract.trunc = trunc i32 %0 to i16
%1 = bitcast i16 %tmp.0.extract.trunc to half
%2 = bitcast float %b.coerce to i32
%tmp1.0.extract.trunc = trunc i32 %2 to i16
%3 = bitcast i16 %tmp1.0.extract.trunc to half
%add = fadd half %1, %3
%4 = bitcast half %add to i16
%tmp4.0.insert.ext = zext i16 %4 to i32
%5 = bitcast i32 %tmp4.0.insert.ext to float
ret float %5
; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_h2f
; CHECK-SOFT: bl __aeabi_fadd
; CHECK-SOFT: bl __aeabi_f2h
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: bl __aeabi_h2f
; CHECK-SOFTFP-VFP3: vadd.f32
; CHECK-SOFTFP-VFP3: bl __aeabi_f2h
; CHECK-SOFTFP-FP16: vmov [[S2:s[0-9]]], r1
; CHECK-SOFTFP-FP16: vmov [[S0:s[0-9]]], r0
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S2]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f32.f16 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vadd.f32 [[S0]], [[S0]], [[S2]]
; CHECK-SOFTFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
; CHECK-SOFTFP-FP16: vmov r0, s0
; CHECK-SOFTFP-FULLFP16: strh r1, {{.*}}
; CHECK-SOFTFP-FULLFP16: strh r0, {{.*}}
; CHECK-SOFTFP-FULLFP16: vldr.16 [[S0:s[0-9]]], {{.*}}
; CHECK-SOFTFP-FULLFP16: vldr.16 [[S2:s[0-9]]], {{.*}}
; CHECK-SOFTFP-FULLFP16: vadd.f16 [[S0]], [[S2]], [[S0]]
; CHECK-SOFTFP-FULLFP16: vstr.16 [[S2:s[0-9]]], {{.*}}
; CHECK-SOFTFP-FULLFP16: ldrh r0, {{.*}}
; CHECK-SOFTFP-FULLFP16: mov pc, lr
; CHECK-HARDFP-VFP3: vmov r{{.}}, s0
; CHECK-HARDFP-VFP3: vmov{{.*}}, s1
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: bl __aeabi_h2f
; CHECK-HARDFP-VFP3: vadd.f32
; CHECK-HARDFP-VFP3: bl __aeabi_f2h
; CHECK-HARDFP-VFP3: vmov s0, r0
; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S2:s[0-9]]], s1
; CHECK-HARDFP-FP16: vcvtb.f32.f16 [[S0:s[0-9]]], s0
; CHECK-HARDFP-FP16: vadd.f32 [[S0]], [[S0]], [[S2]]
; CHECK-HARDFP-FP16: vcvtb.f16.f32 [[S0]], [[S0]]
; CHECK-HARDFP-FULLFP16: vadd.f16 s0, s0, s1
; CHECK-HARDFP-FULLFP16-NEXT: mov pc, lr
}