diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h
index 284b67fd59b..63bf48abb7a 100644
--- a/lib/Target/ARM/ARMCallingConv.h
+++ b/lib/Target/ARM/ARMCallingConv.h
@@ -217,12 +217,15 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned &ValNo, MVT &ValVT,
     break;
   }
+  case MVT::f16:
   case MVT::f32:
     RegList = SRegList;
     break;
+  case MVT::v4f16:
   case MVT::f64:
     RegList = DRegList;
     break;
+  case MVT::v8f16:
   case MVT::v2f64:
     RegList = QRegList;
     break;
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 11bf492815c..f173e423f3e 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -160,8 +160,8 @@ def CC_ARM_AAPCS : CallingConv<[
   CCIfNest<CCAssignToReg<[R12]>>,
 
   // Handle all vector types as either f64 or v2f64.
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+  CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
 
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -176,8 +176,8 @@ def RetCC_ARM_AAPCS : CallingConv<[
   // Handle all vector types as either f64 or v2f64.
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+  CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
 
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -201,8 +201,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
   CCIfByVal<CCPassByVal<4, 4>>,
 
   // Handle all vector types as either f64 or v2f64.
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+  CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
 
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -222,8 +222,8 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
   // Handle all vector types as either f64 or v2f64.
-  CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+  CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
+  CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
 
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 94fe84c8751..91d1aceacaa 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1893,12 +1893,14 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
   default: llvm_unreachable("unhandled vst type");
     // Double-register operations:
   case MVT::v8i8:  OpcodeIndex = 0; break;
+  case MVT::v4f16:
   case MVT::v4i16: OpcodeIndex = 1; break;
   case MVT::v2f32:
   case MVT::v2i32: OpcodeIndex = 2; break;
   case MVT::v1i64: OpcodeIndex = 3; break;
     // Quad-register operations:
   case MVT::v16i8: OpcodeIndex = 0; break;
+  case MVT::v8f16:
   case MVT::v8i16: OpcodeIndex = 1; break;
   case MVT::v4f32:
   case MVT::v4i32: OpcodeIndex = 2; break;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index e86e5bce2b6..dc4185752ea 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -565,6 +565,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     addQRTypeForNEON(MVT::v4i32);
     addQRTypeForNEON(MVT::v2i64);
 
+    if (Subtarget->hasFullFP16()) {
+      addQRTypeForNEON(MVT::v8f16);
+      addDRTypeForNEON(MVT::v4f16);
+    }
+
     // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
     // neither Neon nor VFP support any arithmetic operations on it.
     // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
@@ -3727,9 +3732,9 @@ SDValue ARMTargetLowering::LowerFormalArguments(
         RC = &ARM::HPRRegClass;
       else if (RegVT == MVT::f32)
         RC = &ARM::SPRRegClass;
-      else if (RegVT == MVT::f64)
+      else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
         RC = &ARM::DPRRegClass;
-      else if (RegVT == MVT::v2f64)
+      else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
         RC = &ARM::QPRRegClass;
       else if (RegVT == MVT::i32)
         RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index ca56835f984..3137601b2b5 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -6971,9 +6971,11 @@ def : Pat<(f64 (bitconvert (v1i64 DPR:$src))), (f64 DPR:$src)>;
 let Predicates = [IsLE] in {
   def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
   def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>;
+  def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>;
   def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>;
   def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>;
   def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>;
+  def : Pat<(v4f16 (bitconvert (f64 DPR:$src))), (v4f16 DPR:$src)>;
   def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
 }
 def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
@@ -7002,6 +7004,7 @@ let Predicates = [IsLE] in {
   def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
   def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
   def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (v8f16 QPR:$src)>;
   def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
   def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
   def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
@@ -7019,6 +7022,7 @@ def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
 let Predicates = [IsLE] in {
   def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
   def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
   def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
   def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
 }
@@ -7044,6 +7048,7 @@ let Predicates = [IsBE] in {
   def : Pat<(v8i8 (bitconvert (f64 DPR:$src))), (VREV64d8 DPR:$src)>;
   def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>;
   def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
   def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
   def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
   def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
@@ -7065,6 +7070,7 @@ let Predicates = [IsBE] in {
   def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
   def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
   def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
   def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
   def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>;
   def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>;
@@ -7073,10 +7079,12 @@ let Predicates = [IsBE] in {
   def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;
   def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
   def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
   def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
   def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
   def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
   def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
   def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
   def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
 }
diff --git a/test/CodeGen/ARM/fp16-intrinsic-vector-1op.ll b/test/CodeGen/ARM/fp16-intrinsic-vector-1op.ll
new file mode 100644
index 00000000000..f95def9d4d6
--- /dev/null
+++ b/test/CodeGen/ARM/fp16-intrinsic-vector-1op.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+v8.2a,+fullfp16,+neon -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-HARD
+; RUN: llc < %s -mtriple=armeb-none-eabi -mattr=+v8.2a,+fullfp16,+neon -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-HARD-BE
+; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+v8.2a,+fullfp16,+neon | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP
+; RUN: llc < %s -mtriple=armeb-none-eabi -mattr=+v8.2a,+fullfp16,+neon | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP-BE
+
+declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
+
+define dso_local <8 x half> @t_vabsq_f16(<8 x half> %a) {
+; CHECK-LABEL: t_vabsq_f16:
+
+; CHECK-HARD:      vabs.f16 q0, q0
+; CHECK-HARD-NEXT: bx lr
+
+; CHECK-HARD-BE:      vrev64.16 [[Q8:q[0-9]+]], q0
+; CHECK-HARD-BE-NEXT: vabs.f16 [[Q8]], [[Q8]]
+; CHECK-HARD-BE-NEXT: vrev64.16 q0, [[Q8]]
+; CHECK-HARD-BE-NEXT: bx lr
+
+; CHECK-SOFTFP: vmov d{{.*}}, r2, r3
+; CHECK-SOFTFP: vmov d{{.*}}, r0, r1
+; CHECK-SOFTFP: vabs.f16 q{{.*}}, q{{.*}}
+; CHECK-SOFTFP: vmov r0, r1, d{{.*}}
+; CHECK-SOFTFP: vmov r2, r3, d{{.*}}
+; CHECK-SOFTFP: bx lr
+
+; CHECK-SOFTFP-BE: vmov [[D17:d[0-9]+]], r3, r2
+; CHECK-SOFTFP-BE: vmov [[D16:d[0-9]+]], r1, r0
+; CHECK-SOFTFP-BE: vrev64.16 [[Q8:q[0-9]+]], q{{[0-9]+}}
+; CHECK-SOFTFP-BE: vabs.f16 [[Q8]], [[Q8]]
+; CHECK-SOFTFP-BE: vrev64.16 [[Q8]], [[Q8]]
+; CHECK-SOFTFP-BE: vmov r1, r0, [[D16]]
+; CHECK-SOFTFP-BE: vmov r3, r2, [[D17]]
+; CHECK-SOFTFP-BE: bx lr
+
+entry:
+  %vabs1.i = tail call <8 x half> @llvm.fabs.v8f16(<8 x half> %a)
+  ret <8 x half> %vabs1.i
+}
+
diff --git a/test/CodeGen/ARM/fp16-intrinsic-vector-2op.ll b/test/CodeGen/ARM/fp16-intrinsic-vector-2op.ll
new file mode 100644
index 00000000000..c1ada259419
--- /dev/null
+++ b/test/CodeGen/ARM/fp16-intrinsic-vector-2op.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+v8.2a,+fullfp16,+neon -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-HARD
+; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+v8.2a,+fullfp16,+neon | FileCheck %s --check-prefixes=CHECK,CHECK-SOFTFP
+
+declare <4 x half> @llvm.arm.neon.vpadd.v4f16(<4 x half>, <4 x half>)
+
+define dso_local <4 x half> @t_vpadd_f16(<4 x half> %a, <4 x half> %b) {
+; CHECK-LABEL: t_vpadd_f16:
+
+; CHECK-HARD:      vpadd.f16 d0, d0, d1
+; CHECK-HARD-NEXT: bx lr
+
+; CHECK-SOFTFP: vmov [[D1:d[0-9]+]], r2, r3
+; CHECK-SOFTFP: vmov [[D2:d[0-9]+]], r0, r1
+; CHECK-SOFTFP: vpadd.f16 [[D3:d[0-9]+]], [[D2]], [[D1]]
+; CHECK-SOFTFP: vmov r0, r1, [[D3]]
+; CHECK-SOFTFP: bx lr
+
+entry:
+  %vpadd_v2.i = tail call <4 x half> @llvm.arm.neon.vpadd.v4f16(<4 x half> %a, <4 x half> %b)
+  ret <4 x half> %vpadd_v2.i
+}
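
For illustration only, not part of the patch: a minimal, compile-only IR sketch of the plain argument-passing case that the calling-convention changes above target, separate from the intrinsic tests. It assumes an AAPCS hard-float target with +fullfp16 and +neon, as in the RUN lines above; the function names and the callee @use_v8f16 are hypothetical, and the comments describe expected lowering rather than checked output.

; RUN: llc < %s -mtriple=arm-none-eabi -mattr=+v8.2a,+fullfp16,+neon -float-abi=hard

; Hypothetical callee used only to force an outgoing-argument assignment.
declare <8 x half> @use_v8f16(<8 x half>)

; With v8f16 handled like the other 128-bit vector types by CC_ARM_AAPCS_VFP and
; RetCC_ARM_AAPCS_VFP, the <8 x half> argument is expected to arrive in q0 (d0/d1)
; and to be forwarded to the callee without being split into core registers.
define <8 x half> @forward_v8f16(<8 x half> %a) {
entry:
  %r = tail call <8 x half> @use_v8f16(<8 x half> %a)
  ret <8 x half> %r
}

; A <4 x half> value should likewise be assigned to a single D register, matching the
; new v4f16 cases in CC_ARM_AAPCS_Custom_Aggregate and LowerFormalArguments.
define <4 x half> @pick_first_v4f16(<4 x half> %a, <4 x half> %b) {
entry:
  ret <4 x half> %a
}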