1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2025-01-31 12:41:49 +01:00
llvm-mirror/test/CodeGen/AArch64/fp16-vector-shuffle.ll
Pavel Iliin 2b3fd0441a [AArch64] Add BIT/BIF support.
This patch added generation of SIMD bitwise insert BIT/BIF instructions.
In the absence of GCC-like functionality for optimal constraints satisfaction
during register allocation the bitwise insert and select patterns are matched
by pseudo bitwise select BSP instruction with not tied def.
It is expanded later after register allocation with def tied
to BSL/BIT/BIF depending on operands registers.
This allows to get rid of redundant moves.

Reviewers: t.p.northover, samparker, dmgreen

Reviewed By: dmgreen

Differential Revision: https://reviews.llvm.org/D74147
2020-02-14 14:19:39 +00:00

375 lines
13 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64-none-eabi | FileCheck %s
; float16x4_t select_64(float16x4_t a, float16x4_t b, uint16x4_t c) { return vbsl_u16(c, a, b); }
define <4 x half> @select_64(<4 x half> %a, <4 x half> %b, <4 x i16> %c) #0 {
; CHECK-LABEL: select_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT: ret
entry:
%0 = bitcast <4 x half> %a to <4 x i16>
%1 = bitcast <4 x half> %b to <4 x i16>
%vbsl3.i = and <4 x i16> %0, %c
%2 = xor <4 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1>
%vbsl4.i = and <4 x i16> %1, %2
%vbsl5.i = or <4 x i16> %vbsl3.i, %vbsl4.i
%3 = bitcast <4 x i16> %vbsl5.i to <4 x half>
ret <4 x half> %3
}
; float16x8_t select_128(float16x8_t a, float16x8_t b, uint16x8_t c) { return vbslq_u16(c, a, b); }
define <8 x half> @select_128(<8 x half> %a, <8 x half> %b, <8 x i16> %c) #0 {
; CHECK-LABEL: select_128:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x half> %a to <8 x i16>
%1 = bitcast <8 x half> %b to <8 x i16>
%vbsl3.i = and <8 x i16> %0, %c
%2 = xor <8 x i16> %c, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
%vbsl4.i = and <8 x i16> %1, %2
%vbsl5.i = or <8 x i16> %vbsl3.i, %vbsl4.i
%3 = bitcast <8 x i16> %vbsl5.i to <8 x half>
ret <8 x half> %3
}
; float16x4_t lane_64_64(float16x4_t a, float16x4_t b) {
; return vcopy_lane_s16(a, 1, b, 2);
; }
define <4 x half> @lane_64_64(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-LABEL: lane_64_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.h[1], v1.h[2]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
entry:
%0 = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 6, i32 2, i32 3>
ret <4 x half> %0
}
; float16x8_t lane_128_64(float16x8_t a, float16x4_t b) {
; return vcopyq_lane_s16(a, 1, b, 2);
; }
define <8 x half> @lane_128_64(<8 x half> %a, <4 x half> %b) #0 {
; CHECK-LABEL: lane_128_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.h[1], v1.h[2]
; CHECK-NEXT: ret
entry:
%0 = bitcast <4 x half> %b to <4 x i16>
%vget_lane = extractelement <4 x i16> %0, i32 2
%1 = bitcast <8 x half> %a to <8 x i16>
%vset_lane = insertelement <8 x i16> %1, i16 %vget_lane, i32 1
%2 = bitcast <8 x i16> %vset_lane to <8 x half>
ret <8 x half> %2
}
; float16x4_t lane_64_128(float16x4_t a, float16x8_t b) {
; return vcopy_laneq_s16(a, 3, b, 5);
; }
define <4 x half> @lane_64_128(<4 x half> %a, <8 x half> %b) #0 {
; CHECK-LABEL: lane_64_128:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: mov v0.h[3], v1.h[5]
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x half> %b to <8 x i16>
%vgetq_lane = extractelement <8 x i16> %0, i32 5
%1 = bitcast <4 x half> %a to <4 x i16>
%vset_lane = insertelement <4 x i16> %1, i16 %vgetq_lane, i32 3
%2 = bitcast <4 x i16> %vset_lane to <4 x half>
ret <4 x half> %2
}
; float16x8_t lane_128_128(float16x8_t a, float16x8_t b) {
; return vcopyq_laneq_s16(a, 3, b, 5);
; }
define <8 x half> @lane_128_128(<8 x half> %a, <8 x half> %b) #0 {
; CHECK-LABEL: lane_128_128:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: mov v0.h[3], v1.h[5]
; CHECK-NEXT: ret
entry:
%0 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 13, i32 4, i32 5, i32 6, i32 7>
ret <8 x half> %0
}
; float16x4_t ext_64(float16x4_t a, float16x4_t b) {
; return vext_s16(a, b, 3);
; }
define <4 x half> @ext_64(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-LABEL: ext_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #6
; CHECK-NEXT: ret
entry:
%0 = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
ret <4 x half> %0
}
; float16x8_t ext_128(float16x8_t a, float16x8_t b) {
; return vextq_s16(a, b, 3);
; }
define <8 x half> @ext_128(<8 x half> %a, <8 x half> %b) #0 {
; CHECK-LABEL: ext_128:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #6
; CHECK-NEXT: ret
entry:
%0 = shufflevector <8 x half> %a, <8 x half> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
ret <8 x half> %0
}
; float16x4_t rev32_64(float16x4_t a) {
; return vrev32_s16(a);
; }
define <4 x half> @rev32_64(<4 x half> %a) #0 {
; CHECK-LABEL: rev32_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rev32 v0.4h, v0.4h
; CHECK-NEXT: ret
entry:
%0 = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
ret <4 x half> %0
}
; float16x4_t rev64_64(float16x4_t a) {
; return vrev64_s16(a);
; }
define <4 x half> @rev64_64(<4 x half> %a) #0 {
; CHECK-LABEL: rev64_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rev64 v0.4h, v0.4h
; CHECK-NEXT: ret
entry:
%0 = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
ret <4 x half> %0
}
; float16x8_t rev32_128(float16x8_t a) {
; return vrev32q_s16(a);
; }
define <8 x half> @rev32_128(<8 x half> %a) #0 {
; CHECK-LABEL: rev32_128:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rev32 v0.8h, v0.8h
; CHECK-NEXT: ret
entry:
%0 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
ret <8 x half> %0
}
; float16x8_t rev64_128(float16x8_t a) {
; return vrev64q_s16(a);
; }
define <8 x half> @rev64_128(<8 x half> %a) #0 {
; CHECK-LABEL: rev64_128:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: rev64 v0.8h, v0.8h
; CHECK-NEXT: ret
entry:
%0 = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
ret <8 x half> %0
}
; float16x4_t create_64(long long a) { return vcreate_f16(a); }
define <4 x half> @create_64(i64 %a) #0 {
; CHECK-LABEL: create_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fmov d0, x0
; CHECK-NEXT: ret
entry:
%0 = bitcast i64 %a to <4 x half>
ret <4 x half> %0
}
; float16x4_t dup_64(__fp16 a) { return vdup_n_f16(a); }
define <4 x half> @dup_64(half %a) #0 {
; CHECK-LABEL: dup_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
; CHECK-NEXT: dup v0.4h, v0.h[0]
; CHECK-NEXT: ret
entry:
%vecinit = insertelement <4 x half> undef, half %a, i32 0
%vecinit1 = insertelement <4 x half> %vecinit, half %a, i32 1
%vecinit2 = insertelement <4 x half> %vecinit1, half %a, i32 2
%vecinit3 = insertelement <4 x half> %vecinit2, half %a, i32 3
ret <4 x half> %vecinit3
}
; float16x8_t dup_128(__fp16 a) { return vdupq_n_f16(a); }
define <8 x half> @dup_128(half %a) #0 {
; CHECK-LABEL: dup_128:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h0 killed $h0 def $q0
; CHECK-NEXT: dup v0.8h, v0.h[0]
; CHECK-NEXT: ret
entry:
%vecinit = insertelement <8 x half> undef, half %a, i32 0
%vecinit1 = insertelement <8 x half> %vecinit, half %a, i32 1
%vecinit2 = insertelement <8 x half> %vecinit1, half %a, i32 2
%vecinit3 = insertelement <8 x half> %vecinit2, half %a, i32 3
%vecinit4 = insertelement <8 x half> %vecinit3, half %a, i32 4
%vecinit5 = insertelement <8 x half> %vecinit4, half %a, i32 5
%vecinit6 = insertelement <8 x half> %vecinit5, half %a, i32 6
%vecinit7 = insertelement <8 x half> %vecinit6, half %a, i32 7
ret <8 x half> %vecinit7
}
; float16x4_t dup_lane_64(float16x4_t a) { return vdup_lane_f16(a, 2); }
define <4 x half> @dup_lane_64(<4 x half> %a) #0 {
; CHECK-LABEL: dup_lane_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup v0.4h, v0.h[2]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x half> %a, <4 x half> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x half> %shuffle
}
; float16x8_t dup_lane_128(float16x4_t a) { return vdupq_lane_f16(a, 2); }
define <8 x half> @dup_lane_128(<4 x half> %a) #0 {
; CHECK-LABEL: dup_lane_128:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: dup v0.8h, v0.h[2]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <4 x half> %a, <4 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <8 x half> %shuffle
}
; float16x4_t dup_laneq_64(float16x8_t a) { return vdup_laneq_f16(a, 2); }
define <4 x half> @dup_laneq_64(<8 x half> %a) #0 {
; CHECK-LABEL: dup_laneq_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: dup v0.4h, v0.h[2]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
ret <4 x half> %shuffle
}
; float16x8_t dup_laneq_128(float16x8_t a) { return vdupq_laneq_f16(a, 2); }
define <8 x half> @dup_laneq_128(<8 x half> %a) #0 {
; CHECK-LABEL: dup_laneq_128:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: dup v0.8h, v0.h[2]
; CHECK-NEXT: ret
entry:
%shuffle = shufflevector <8 x half> %a, <8 x half> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
ret <8 x half> %shuffle
}
; float16x8_t vcombine(float16x4_t a, float16x4_t b) { return vcombine_f16(a, b); }
define <8 x half> @vcombine(<4 x half> %a, <4 x half> %b) #0 {
; CHECK-LABEL: vcombine:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
; CHECK-NEXT: mov v0.d[1], v1.d[0]
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <4 x half> %a, <4 x half> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <8 x half> %shuffle.i
}
; float16x4_t get_high(float16x8_t a) { return vget_high_f16(a); }
define <4 x half> @get_high(<8 x half> %a) #0 {
; CHECK-LABEL: get_high:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
ret <4 x half> %shuffle.i
}
; float16x4_t get_low(float16x8_t a) { return vget_low_f16(a); }
define <4 x half> @get_low(<8 x half> %a) #0 {
; CHECK-LABEL: get_low:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
entry:
%shuffle.i = shufflevector <8 x half> %a, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x half> %shuffle.i
}
; float16x4_t set_lane_64(float16x4_t a, __fp16 b) { return vset_lane_f16(b, a, 2); }
define <4 x half> @set_lane_64(<4 x half> %a, half %b) #0 {
; CHECK-LABEL: set_lane_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: mov v0.h[2], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
entry:
%0 = bitcast half %b to i16
%1 = bitcast <4 x half> %a to <4 x i16>
%vset_lane = insertelement <4 x i16> %1, i16 %0, i32 2
%2 = bitcast <4 x i16> %vset_lane to <4 x half>
ret <4 x half> %2
}
; float16x8_t set_lane_128(float16x8_t a, __fp16 b) { return vsetq_lane_f16(b, a, 2); }
define <8 x half> @set_lane_128(<8 x half> %a, half %b) #0 {
; CHECK-LABEL: set_lane_128:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1
; CHECK-NEXT: fmov w8, s1
; CHECK-NEXT: mov v0.h[2], w8
; CHECK-NEXT: ret
entry:
%0 = bitcast half %b to i16
%1 = bitcast <8 x half> %a to <8 x i16>
%vset_lane = insertelement <8 x i16> %1, i16 %0, i32 2
%2 = bitcast <8 x i16> %vset_lane to <8 x half>
ret <8 x half> %2
}
; __fp16 get_lane_64(float16x4_t a) { return vget_lane_f16(a, 2); }
define half @get_lane_64(<4 x half> %a) #0 {
; CHECK-LABEL: get_lane_64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: umov w8, v0.h[2]
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
%0 = bitcast <4 x half> %a to <4 x i16>
%vget_lane = extractelement <4 x i16> %0, i32 2
%1 = bitcast i16 %vget_lane to half
ret half %1
}
; __fp16 get_lane_128(float16x8_t a) { return vgetq_lane_f16(a, 2); }
define half @get_lane_128(<8 x half> %a) #0 {
; CHECK-LABEL: get_lane_128:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: umov w8, v0.h[2]
; CHECK-NEXT: fmov s0, w8
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
%0 = bitcast <8 x half> %a to <8 x i16>
%vgetq_lane = extractelement <8 x i16> %0, i32 2
%1 = bitcast i16 %vgetq_lane to half
ret half %1
}