mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-01-31 20:51:52 +01:00
ede15a7ec3
This change adds two FP16 extraction patterns and two FP16 insertion patterns (one per possible vector length). Extractions are handled by copying a Q/D register into one of the VFP2-class registers, where the single FP32 sub-registers can be accessed. The extraction of an even lane is then a simple sub-register extraction (because we don't care about the top parts of registers for FP16 operations). Odd lanes need an additional VMOVX instruction. Unfortunately, insertions cannot be handled in the same way, because: * There is no instruction to insert an FP16 value into an even lane (VINS only works with odd lanes) * The patterns for odd lanes would have the form of a DAG (not a tree), and would not be implementable in pure TableGen. Because of this, insertions are handled in the same way as 16-bit integer insertions (with conversions between FP registers and GPRs using VMOVHR instructions). Without these patterns the ARM backend would sometimes fail during instruction selection. This patch also adds patterns which combine: * an FP16 element extraction and a store into a single VST1 instruction * an FP16 load and an insertion into a single VLD1 instruction. Differential Revision: https://reviews.llvm.org/D62651 llvm-svn: 362482
57 lines
1.7 KiB
LLVM
57 lines
1.7 KiB
LLVM
; Checks that FP16 lane loads and stores on NEON vectors are selected as
; vld1.16 / vst1.16 single-lane instructions.
; Both RUN lines use the default CHECK prefix, so the same expected output is
; verified for the hard-float and the soft-float ABI.
; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=hard -O1 < %s | FileCheck %s
; RUN: llc -mtriple=arm-eabi -mattr=+armv8.2-a,+fullfp16,+neon -float-abi=soft -O1 < %s | FileCheck %s

; Loads one half from %pa and inserts it into lane 3 of a <4 x half> vector.
; The CHECK line requires a vld1.16 lane load into lane 3 of a D register,
; addressed through r0 with a 16-bit alignment hint, to appear in the output.
define <4 x half> @vld1d_lane_f16(half* %pa, <4 x half> %v4) nounwind {
; CHECK-LABEL: vld1d_lane_f16:
; CHECK: vld1.16 {d{{[0-9]+}}[3]}, [r0:16]
entry:
  %a = load half, half* %pa
  %res = insertelement <4 x half> %v4, half %a, i32 3
  ret <4 x half> %res
}

; Loads one half from %pa and inserts it into lane 1 of an <8 x half> vector.
; The CHECK line requires a vld1.16 lane load into lane 1 of a D register,
; addressed through r0 with a 16-bit alignment hint, to appear in the output.
define <8 x half> @vld1q_lane_f16_1(half* %pa, <8 x half> %v8) nounwind {
; CHECK-LABEL: vld1q_lane_f16_1:
; CHECK: vld1.16 {d{{[0-9]+}}[1]}, [r0:16]
entry:
  %a = load half, half* %pa
  %res = insertelement <8 x half> %v8, half %a, i32 1
  ret <8 x half> %res
}

; Loads one half from %pa and inserts it into lane 7 of an <8 x half> vector.
; The CHECK line expects lane [3] of a D register: lane 7 of the 8-element
; (Q-register) vector is lane 3 of its upper D-register half.
define <8 x half> @vld1q_lane_f16_7(half* %pa, <8 x half> %v8) nounwind {
; CHECK-LABEL: vld1q_lane_f16_7:
; CHECK: vld1.16 {d{{[0-9]+}}[3]}, [r0:16]
entry:
  %a = load half, half* %pa
  %res = insertelement <8 x half> %v8, half %a, i32 7
  ret <8 x half> %res
}

; Extracts lane 3 of a <4 x half> vector and stores it to %pa.
; The CHECK line requires a vst1.16 lane store from lane 3 of a D register,
; addressed through r0 with a 16-bit alignment hint, to appear in the output.
define void @vst1d_lane_f16(half* %pa, <4 x half> %v4) nounwind {
; CHECK-LABEL: vst1d_lane_f16:
; CHECK: vst1.16 {d{{[0-9]+}}[3]}, [r0:16]
entry:
  %a = extractelement <4 x half> %v4, i32 3
  store half %a, half* %pa
  ret void
}

; Extracts lane 7 of an <8 x half> vector and stores it to %pa.
; The CHECK line expects lane [3] of a D register: lane 7 of the 8-element
; (Q-register) vector is lane 3 of its upper D-register half.
define void @vst1q_lane_f16_7(half* %pa, <8 x half> %v8) nounwind {
; CHECK-LABEL: vst1q_lane_f16_7:
; CHECK: vst1.16 {d{{[0-9]+}}[3]}, [r0:16]
entry:
  %a = extractelement <8 x half> %v8, i32 7
  store half %a, half* %pa
  ret void
}

; Extracts lane 1 of an <8 x half> vector and stores it to %pa.
; The CHECK line requires a vst1.16 lane store from lane 1 of a D register,
; addressed through r0 with a 16-bit alignment hint, to appear in the output.
define void @vst1q_lane_f16_1(half* %pa, <8 x half> %v8) nounwind {
; CHECK-LABEL: vst1q_lane_f16_1:
; CHECK: vst1.16 {d{{[0-9]+}}[1]}, [r0:16]
entry:
  %a = extractelement <8 x half> %v8, i32 1
  store half %a, half* %pa
  ret void
}