From f8a9b17bf4725e83b42806568c69428d1e05a732 Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Tue, 22 Apr 2014 12:45:32 +0000 Subject: [PATCH] AArch64/ARM64: add extra testing from AArch64 to ARM64 llvm-svn: 206887 --- .../AArch64/neon-scalar-extract-narrow.ll | 1 + test/CodeGen/AArch64/neon-scalar-fabd.ll | 1 + test/CodeGen/AArch64/neon-scalar-fcvt.ll | 1 + .../CodeGen/AArch64/neon-scalar-fp-compare.ll | 1 + test/CodeGen/AArch64/neon-scalar-mul.ll | 1 + test/CodeGen/AArch64/neon-scalar-neg.ll | 1 + test/CodeGen/AArch64/neon-scalar-recip.ll | 1 + .../AArch64/neon-scalar-reduce-pairwise.ll | 1 + .../AArch64/neon-scalar-rounding-shift.ll | 2 +- .../AArch64/neon-scalar-saturating-add-sub.ll | 1 + .../neon-scalar-saturating-rounding-shift.ll | 1 + .../AArch64/neon-scalar-saturating-shift.ll | 1 + test/CodeGen/AArch64/neon-scalar-shift-imm.ll | 1 + test/CodeGen/AArch64/neon-scalar-shift.ll | 1 + test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll | 1 + .../CodeGen/AArch64/neon-vector-list-spill.ll | 1 + test/CodeGen/ARM64/aarch64-neon-simd-shift.ll | 663 ++++++++++++++++++ test/CodeGen/ARM64/aarch64-neon-simd-vget.ll | 225 ++++++ .../ARM64/aarch64-neon-vector-list-spill.ll | 175 +++++ 19 files changed, 1079 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/ARM64/aarch64-neon-simd-shift.ll create mode 100644 test/CodeGen/ARM64/aarch64-neon-simd-vget.ll create mode 100644 test/CodeGen/ARM64/aarch64-neon-vector-list-spill.ll diff --git a/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll b/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll index faf521bc889..2004226bd13 100644 --- a/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll +++ b/test/CodeGen/AArch64/neon-scalar-extract-narrow.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s +; intrinsic wrangling that arm64 does differently. 
define i8 @test_vqmovunh_s16(i16 %a) { ; CHECK: test_vqmovunh_s16 diff --git a/test/CodeGen/AArch64/neon-scalar-fabd.ll b/test/CodeGen/AArch64/neon-scalar-fabd.ll index 6343310a3c0..9b2ae2bbc0a 100644 --- a/test/CodeGen/AArch64/neon-scalar-fabd.ll +++ b/test/CodeGen/AArch64/neon-scalar-fabd.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s +; arm64 has these two tests in vabs.ll define float @test_vabds_f32(float %a, float %b) { ; CHECK-LABEL: test_vabds_f32 diff --git a/test/CodeGen/AArch64/neon-scalar-fcvt.ll b/test/CodeGen/AArch64/neon-scalar-fcvt.ll index 6cf30a7df3b..341ed69b482 100644 --- a/test/CodeGen/AArch64/neon-scalar-fcvt.ll +++ b/test/CodeGen/AArch64/neon-scalar-fcvt.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s +; arm64 duplicates these tests in cvt.ll ;; Scalar Floating-point Convert diff --git a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll index e0dce1336d8..b17d8655c6f 100644 --- a/test/CodeGen/AArch64/neon-scalar-fp-compare.ll +++ b/test/CodeGen/AArch64/neon-scalar-fp-compare.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s +; arm64 does not use intrinsics for comparisons. ;; Scalar Floating-point Compare diff --git a/test/CodeGen/AArch64/neon-scalar-mul.ll b/test/CodeGen/AArch64/neon-scalar-mul.ll index 991037f6cb8..ac44c090b41 100644 --- a/test/CodeGen/AArch64/neon-scalar-mul.ll +++ b/test/CodeGen/AArch64/neon-scalar-mul.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s +; Just intrinsic wrangling, and arm64 does scalar differently anyway. 
define i16 @test_vqdmulhh_s16(i16 %a, i16 %b) { ; CHECK: test_vqdmulhh_s16 diff --git a/test/CodeGen/AArch64/neon-scalar-neg.ll b/test/CodeGen/AArch64/neon-scalar-neg.ll index b48e86887af..6eb0a1a152b 100644 --- a/test/CodeGen/AArch64/neon-scalar-neg.ll +++ b/test/CodeGen/AArch64/neon-scalar-neg.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s +; Intrinsic wrangling. arm64 does it differently. define i64 @test_vnegd_s64(i64 %a) { ; CHECK: test_vnegd_s64 diff --git a/test/CodeGen/AArch64/neon-scalar-recip.ll b/test/CodeGen/AArch64/neon-scalar-recip.ll index 100839b14e6..4b1ca6e91c8 100644 --- a/test/CodeGen/AArch64/neon-scalar-recip.ll +++ b/test/CodeGen/AArch64/neon-scalar-recip.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s +; duplicates arm64 tests in vsqrt.ll define float @test_vrecpss_f32(float %a, float %b) { ; CHECK: test_vrecpss_f32 diff --git a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll index 33ce5cf6ce6..2b94d7524eb 100644 --- a/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll +++ b/test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s +; Intrinsic wrangling. Duplicates various arm64 tests. 
declare <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64>) diff --git a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll index 7c9ffa0727b..ae097afb3a3 100644 --- a/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll +++ b/test/CodeGen/AArch64/neon-scalar-rounding-shift.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s - +; Duplicates arm64'd vshift.ll declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll index 5c010ef0063..ea5f8f9286f 100644 --- a/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll +++ b/test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s +; Intrinsic wrangling and arm64 does it differently. declare <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8>, <1 x i8>) declare <1 x i8> @llvm.arm.neon.vqadds.v1i8(<1 x i8>, <1 x i8>) diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll index dbf9669202c..e78c55bfe16 100644 --- a/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll +++ b/test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s +; Intrinsic wrangling and arm64 does it differently. 
declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) diff --git a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll index 0a1f4c9b3f5..b7f956cf612 100644 --- a/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll +++ b/test/CodeGen/AArch64/neon-scalar-saturating-shift.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s +; Intrinsic wrangling and arm64 does it differently. declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>) declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>) diff --git a/test/CodeGen/AArch64/neon-scalar-shift-imm.ll b/test/CodeGen/AArch64/neon-scalar-shift-imm.ll index 62243618171..a2bdae5f52c 100644 --- a/test/CodeGen/AArch64/neon-scalar-shift-imm.ll +++ b/test/CodeGen/AArch64/neon-scalar-shift-imm.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s +; Intrinsic wrangling & arm64 does it differently. 
define i64 @test_vshrd_n_s64(i64 %a) { ; CHECK: test_vshrd_n_s64 diff --git a/test/CodeGen/AArch64/neon-scalar-shift.ll b/test/CodeGen/AArch64/neon-scalar-shift.ll index b712ea4d609..cf3fc0c486a 100644 --- a/test/CodeGen/AArch64/neon-scalar-shift.ll +++ b/test/CodeGen/AArch64/neon-scalar-shift.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s +; Duplicates existing arm64 tests in vshift.ll and vcmp.ll declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>) declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>) diff --git a/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll b/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll index b9396ac2e47..142b0a8bd53 100644 --- a/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll +++ b/test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s +; not relevant for arm64: <1 x iN> isn't legal ; This file tests the spill of FPR8/FPR16. The volatile loads/stores force the ; allocator to keep the value live until it's needed. diff --git a/test/CodeGen/AArch64/neon-vector-list-spill.ll b/test/CodeGen/AArch64/neon-vector-list-spill.ll index a04937e72ac..5df0aacb38a 100644 --- a/test/CodeGen/AArch64/neon-vector-list-spill.ll +++ b/test/CodeGen/AArch64/neon-vector-list-spill.ll @@ -1,4 +1,5 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast +; arm64 has separate copy as aarch64-neon-vector-list-spill.ll ; FIXME: We should not generate ld/st for such register spill/fill, because the ; test case seems very simple and the register pressure is not high. 
If the diff --git a/test/CodeGen/ARM64/aarch64-neon-simd-shift.ll b/test/CodeGen/ARM64/aarch64-neon-simd-shift.ll new file mode 100644 index 00000000000..2fd2c1e35ce --- /dev/null +++ b/test/CodeGen/ARM64/aarch64-neon-simd-shift.ll @@ -0,0 +1,663 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) { +; CHECK: test_vshr_n_s8 +; CHECK: sshr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vshr_n = ashr <8 x i8> %a, + ret <8 x i8> %vshr_n +} + +define <4 x i16> @test_vshr_n_s16(<4 x i16> %a) { +; CHECK: test_vshr_n_s16 +; CHECK: sshr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vshr_n = ashr <4 x i16> %a, + ret <4 x i16> %vshr_n +} + +define <2 x i32> @test_vshr_n_s32(<2 x i32> %a) { +; CHECK: test_vshr_n_s32 +; CHECK: sshr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vshr_n = ashr <2 x i32> %a, + ret <2 x i32> %vshr_n +} + +define <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) { +; CHECK: test_vshrq_n_s8 +; CHECK: sshr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vshr_n = ashr <16 x i8> %a, + ret <16 x i8> %vshr_n +} + +define <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) { +; CHECK: test_vshrq_n_s16 +; CHECK: sshr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vshr_n = ashr <8 x i16> %a, + ret <8 x i16> %vshr_n +} + +define <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) { +; CHECK: test_vshrq_n_s32 +; CHECK: sshr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vshr_n = ashr <4 x i32> %a, + ret <4 x i32> %vshr_n +} + +define <2 x i64> @test_vshrq_n_s64(<2 x i64> %a) { +; CHECK: test_vshrq_n_s64 +; CHECK: sshr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vshr_n = ashr <2 x i64> %a, + ret <2 x i64> %vshr_n +} + +define <8 x i8> @test_vshr_n_u8(<8 x i8> %a) { +; CHECK: test_vshr_n_u8 +; CHECK: ushr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vshr_n = lshr <8 x i8> %a, + ret <8 x i8> %vshr_n +} + +define <4 x i16> @test_vshr_n_u16(<4 x i16> %a) { +; CHECK: test_vshr_n_u16 +; CHECK: ushr {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vshr_n = lshr <4 x i16> %a, 
+ ret <4 x i16> %vshr_n +} + +define <2 x i32> @test_vshr_n_u32(<2 x i32> %a) { +; CHECK: test_vshr_n_u32 +; CHECK: ushr {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vshr_n = lshr <2 x i32> %a, + ret <2 x i32> %vshr_n +} + +define <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) { +; CHECK: test_vshrq_n_u8 +; CHECK: ushr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vshr_n = lshr <16 x i8> %a, + ret <16 x i8> %vshr_n +} + +define <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) { +; CHECK: test_vshrq_n_u16 +; CHECK: ushr {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vshr_n = lshr <8 x i16> %a, + ret <8 x i16> %vshr_n +} + +define <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) { +; CHECK: test_vshrq_n_u32 +; CHECK: ushr {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vshr_n = lshr <4 x i32> %a, + ret <4 x i32> %vshr_n +} + +define <2 x i64> @test_vshrq_n_u64(<2 x i64> %a) { +; CHECK: test_vshrq_n_u64 +; CHECK: ushr {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vshr_n = lshr <2 x i64> %a, + ret <2 x i64> %vshr_n +} + +define <8 x i8> @test_vsra_n_s8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vsra_n_s8 +; CHECK: ssra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vsra_n = ashr <8 x i8> %b, + %1 = add <8 x i8> %vsra_n, %a + ret <8 x i8> %1 +} + +define <4 x i16> @test_vsra_n_s16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vsra_n_s16 +; CHECK: ssra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vsra_n = ashr <4 x i16> %b, + %1 = add <4 x i16> %vsra_n, %a + ret <4 x i16> %1 +} + +define <2 x i32> @test_vsra_n_s32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vsra_n_s32 +; CHECK: ssra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vsra_n = ashr <2 x i32> %b, + %1 = add <2 x i32> %vsra_n, %a + ret <2 x i32> %1 +} + +define <16 x i8> @test_vsraq_n_s8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vsraq_n_s8 +; CHECK: ssra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vsra_n = ashr <16 x i8> %b, + %1 = add <16 x i8> %vsra_n, %a + ret <16 x i8> %1 +} + +define <8 x i16> @test_vsraq_n_s16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsraq_n_s16 +; CHECK: ssra 
{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vsra_n = ashr <8 x i16> %b, + %1 = add <8 x i16> %vsra_n, %a + ret <8 x i16> %1 +} + +define <4 x i32> @test_vsraq_n_s32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vsraq_n_s32 +; CHECK: ssra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vsra_n = ashr <4 x i32> %b, + %1 = add <4 x i32> %vsra_n, %a + ret <4 x i32> %1 +} + +define <2 x i64> @test_vsraq_n_s64(<2 x i64> %a, <2 x i64> %b) { +; CHECK: test_vsraq_n_s64 +; CHECK: ssra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vsra_n = ashr <2 x i64> %b, + %1 = add <2 x i64> %vsra_n, %a + ret <2 x i64> %1 +} + +define <8 x i8> @test_vsra_n_u8(<8 x i8> %a, <8 x i8> %b) { +; CHECK: test_vsra_n_u8 +; CHECK: usra {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #3 + %vsra_n = lshr <8 x i8> %b, + %1 = add <8 x i8> %vsra_n, %a + ret <8 x i8> %1 +} + +define <4 x i16> @test_vsra_n_u16(<4 x i16> %a, <4 x i16> %b) { +; CHECK: test_vsra_n_u16 +; CHECK: usra {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #3 + %vsra_n = lshr <4 x i16> %b, + %1 = add <4 x i16> %vsra_n, %a + ret <4 x i16> %1 +} + +define <2 x i32> @test_vsra_n_u32(<2 x i32> %a, <2 x i32> %b) { +; CHECK: test_vsra_n_u32 +; CHECK: usra {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #3 + %vsra_n = lshr <2 x i32> %b, + %1 = add <2 x i32> %vsra_n, %a + ret <2 x i32> %1 +} + +define <16 x i8> @test_vsraq_n_u8(<16 x i8> %a, <16 x i8> %b) { +; CHECK: test_vsraq_n_u8 +; CHECK: usra {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #3 + %vsra_n = lshr <16 x i8> %b, + %1 = add <16 x i8> %vsra_n, %a + ret <16 x i8> %1 +} + +define <8 x i16> @test_vsraq_n_u16(<8 x i16> %a, <8 x i16> %b) { +; CHECK: test_vsraq_n_u16 +; CHECK: usra {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #3 + %vsra_n = lshr <8 x i16> %b, + %1 = add <8 x i16> %vsra_n, %a + ret <8 x i16> %1 +} + +define <4 x i32> @test_vsraq_n_u32(<4 x i32> %a, <4 x i32> %b) { +; CHECK: test_vsraq_n_u32 +; CHECK: usra {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #3 + %vsra_n = lshr <4 x i32> %b, + %1 = add <4 x i32> %vsra_n, %a + ret <4 x i32> %1 +} + +define <2 x i64> @test_vsraq_n_u64(<2 x 
i64> %a, <2 x i64> %b) { +; CHECK: test_vsraq_n_u64 +; CHECK: usra {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #3 + %vsra_n = lshr <2 x i64> %b, + %1 = add <2 x i64> %vsra_n, %a + ret <2 x i64> %1 +} + +define <8 x i8> @test_vshrn_n_s16(<8 x i16> %a) { +; CHECK: test_vshrn_n_s16 +; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 + %1 = ashr <8 x i16> %a, + %vshrn_n = trunc <8 x i16> %1 to <8 x i8> + ret <8 x i8> %vshrn_n +} + +define <4 x i16> @test_vshrn_n_s32(<4 x i32> %a) { +; CHECK: test_vshrn_n_s32 +; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 + %1 = ashr <4 x i32> %a, + %vshrn_n = trunc <4 x i32> %1 to <4 x i16> + ret <4 x i16> %vshrn_n +} + +define <2 x i32> @test_vshrn_n_s64(<2 x i64> %a) { +; CHECK: test_vshrn_n_s64 +; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 + %1 = ashr <2 x i64> %a, + %vshrn_n = trunc <2 x i64> %1 to <2 x i32> + ret <2 x i32> %vshrn_n +} + +define <8 x i8> @test_vshrn_n_u16(<8 x i16> %a) { +; CHECK: test_vshrn_n_u16 +; CHECK: shrn {{v[0-9]+}}.8b, {{v[0-9]+}}.8h, #3 + %1 = lshr <8 x i16> %a, + %vshrn_n = trunc <8 x i16> %1 to <8 x i8> + ret <8 x i8> %vshrn_n +} + +define <4 x i16> @test_vshrn_n_u32(<4 x i32> %a) { +; CHECK: test_vshrn_n_u32 +; CHECK: shrn {{v[0-9]+}}.4h, {{v[0-9]+}}.4s, #9 + %1 = lshr <4 x i32> %a, + %vshrn_n = trunc <4 x i32> %1 to <4 x i16> + ret <4 x i16> %vshrn_n +} + +define <2 x i32> @test_vshrn_n_u64(<2 x i64> %a) { +; CHECK: test_vshrn_n_u64 +; CHECK: shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #19 + %1 = lshr <2 x i64> %a, + %vshrn_n = trunc <2 x i64> %1 to <2 x i32> + ret <2 x i32> %vshrn_n +} + +define <16 x i8> @test_vshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vshrn_high_n_s16 +; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %1 = ashr <8 x i16> %b, + %vshrn_n = trunc <8 x i16> %1 to <8 x i8> + %2 = bitcast <8 x i8> %a to <1 x i64> + %3 = bitcast <8 x i8> %vshrn_n to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> + %4 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret 
<16 x i8> %4 +} + +define <8 x i16> @test_vshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vshrn_high_n_s32 +; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %1 = ashr <4 x i32> %b, + %vshrn_n = trunc <4 x i32> %1 to <4 x i16> + %2 = bitcast <4 x i16> %a to <1 x i64> + %3 = bitcast <4 x i16> %vshrn_n to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> + %4 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %4 +} + +define <4 x i32> @test_vshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vshrn_high_n_s64 +; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %2 = ashr <2 x i64> %b, + %vshrn_n = trunc <2 x i64> %2 to <2 x i32> + %3 = bitcast <2 x i32> %vshrn_n to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> + %4 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %4 +} + +define <16 x i8> @test_vshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vshrn_high_n_u16 +; CHECK: shrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %1 = lshr <8 x i16> %b, + %vshrn_n = trunc <8 x i16> %1 to <8 x i8> + %2 = bitcast <8 x i8> %a to <1 x i64> + %3 = bitcast <8 x i8> %vshrn_n to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> + %4 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %4 +} + +define <8 x i16> @test_vshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vshrn_high_n_u32 +; CHECK: shrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %1 = lshr <4 x i32> %b, + %vshrn_n = trunc <4 x i32> %1 to <4 x i16> + %2 = bitcast <4 x i16> %a to <1 x i64> + %3 = bitcast <4 x i16> %vshrn_n to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %2, <1 x i64> %3, <2 x i32> + %4 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %4 +} + +define <4 x i32> @test_vshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vshrn_high_n_u64 +; CHECK: shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast 
<2 x i32> %a to <1 x i64> + %2 = lshr <2 x i64> %b, + %vshrn_n = trunc <2 x i64> %2 to <2 x i32> + %3 = bitcast <2 x i32> %vshrn_n to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %3, <2 x i32> + %4 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %4 +} + +define <16 x i8> @test_vqshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vqshrun_high_n_s16 +; CHECK: sqshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %vqshrun = tail call <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 x i8> %vqshrun to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vqshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vqshrun_high_n_s32 +; CHECK: sqshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vqshrun = tail call <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vqshrun to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vqshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vqshrun_high_n_s64 +; CHECK: sqshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vqshrun = tail call <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64> %b, i32 19) + %2 = bitcast <2 x i32> %vqshrun to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + +define <16 x i8> @test_vrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vrshrn_high_n_s16 +; CHECK: rshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %vrshrn = tail call <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 
x i8> %vrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vrshrn_high_n_s32 +; CHECK: rshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vrshrn = tail call <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vrshrn_high_n_s64 +; CHECK: rshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vrshrn = tail call <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64> %b, i32 19) + %2 = bitcast <2 x i32> %vrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + +define <16 x i8> @test_vqrshrun_high_n_s16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vqrshrun_high_n_s16 +; CHECK: sqrshrun2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %vqrshrun = tail call <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 x i8> %vqrshrun to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vqrshrun_high_n_s32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vqrshrun_high_n_s32 +; CHECK: sqrshrun2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vqrshrun = tail call <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vqrshrun to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast 
<2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vqrshrun_high_n_s64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vqrshrun_high_n_s64 +; CHECK: sqrshrun2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vqrshrun = tail call <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64> %b, i32 19) + %2 = bitcast <2 x i32> %vqrshrun to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + +define <16 x i8> @test_vqshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vqshrn_high_n_s16 +; CHECK: sqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %vqshrn = tail call <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 x i8> %vqshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vqshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vqshrn_high_n_s32 +; CHECK: sqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vqshrn = tail call <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vqshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vqshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vqshrn_high_n_s64 +; CHECK: sqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vqshrn = tail call <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64> %b, i32 19) + %2 = bitcast <2 x i32> %vqshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + +define <16 x i8> @test_vqshrn_high_n_u16(<8 x i8> %a, <8 
x i16> %b) { +; CHECK: test_vqshrn_high_n_u16 +; CHECK: uqshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %vqshrn = tail call <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 x i8> %vqshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vqshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vqshrn_high_n_u32 +; CHECK: uqshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vqshrn = tail call <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vqshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vqshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vqshrn_high_n_u64 +; CHECK: uqshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vqshrn = tail call <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64> %b, i32 19) + %2 = bitcast <2 x i32> %vqshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + +define <16 x i8> @test_vqrshrn_high_n_s16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vqrshrn_high_n_s16 +; CHECK: sqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %vqrshrn = tail call <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 x i8> %vqrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vqrshrn_high_n_s32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vqrshrn_high_n_s32 +; CHECK: sqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vqrshrn = tail call 
<4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vqrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vqrshrn_high_n_s64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vqrshrn_high_n_s64 +; CHECK: sqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vqrshrn = tail call <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64> %b, i32 19) + %2 = bitcast <2 x i32> %vqrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + +define <16 x i8> @test_vqrshrn_high_n_u16(<8 x i8> %a, <8 x i16> %b) { +; CHECK: test_vqrshrn_high_n_u16 +; CHECK: uqrshrn2 {{v[0-9]+}}.16b, {{v[0-9]+}}.8h, #3 + %vqrshrn = tail call <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16> %b, i32 3) + %1 = bitcast <8 x i8> %a to <1 x i64> + %2 = bitcast <8 x i8> %vqrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <16 x i8> + ret <16 x i8> %3 +} + +define <8 x i16> @test_vqrshrn_high_n_u32(<4 x i16> %a, <4 x i32> %b) { +; CHECK: test_vqrshrn_high_n_u32 +; CHECK: uqrshrn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.4s, #9 + %vqrshrn = tail call <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32> %b, i32 9) + %1 = bitcast <4 x i16> %a to <1 x i64> + %2 = bitcast <4 x i16> %vqrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <8 x i16> + ret <8 x i16> %3 +} + +define <4 x i32> @test_vqrshrn_high_n_u64(<2 x i32> %a, <2 x i64> %b) { +; CHECK: test_vqrshrn_high_n_u64 +; CHECK: uqrshrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #19 + %1 = bitcast <2 x i32> %a to <1 x i64> + %vqrshrn = tail call <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64> %b, i32 19) 
+ %2 = bitcast <2 x i32> %vqrshrn to <1 x i64> + %shuffle.i = shufflevector <1 x i64> %1, <1 x i64> %2, <2 x i32> + %3 = bitcast <2 x i64> %shuffle.i to <4 x i32> + ret <4 x i32> %3 +} + + + +declare <8 x i8> @llvm.arm64.neon.sqshrun.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.arm64.neon.sqshrun.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.arm64.neon.sqshrun.v2i32(<2 x i64>, i32) + +declare <8 x i8> @llvm.arm64.neon.rshrn.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.arm64.neon.rshrn.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.arm64.neon.rshrn.v2i32(<2 x i64>, i32) + +declare <8 x i8> @llvm.arm64.neon.sqrshrun.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.arm64.neon.sqrshrun.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.arm64.neon.sqrshrun.v2i32(<2 x i64>, i32) + +declare <8 x i8> @llvm.arm64.neon.sqshrn.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.arm64.neon.sqshrn.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.arm64.neon.sqshrn.v2i32(<2 x i64>, i32) + +declare <8 x i8> @llvm.arm64.neon.uqshrn.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.arm64.neon.uqshrn.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.arm64.neon.uqshrn.v2i32(<2 x i64>, i32) + +declare <8 x i8> @llvm.arm64.neon.sqrshrn.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.arm64.neon.sqrshrn.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.arm64.neon.sqrshrn.v2i32(<2 x i64>, i32) + +declare <8 x i8> @llvm.arm64.neon.uqrshrn.v8i8(<8 x i16>, i32) + +declare <4 x i16> @llvm.arm64.neon.uqrshrn.v4i16(<4 x i32>, i32) + +declare <2 x i32> @llvm.arm64.neon.uqrshrn.v2i32(<2 x i64>, i32) + +declare <2 x float> @llvm.arm64.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) + +declare <4 x float> @llvm.arm64.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) + +declare <2 x double> @llvm.arm64.neon.vcvtfxs2fp.v2f64.v2i64(<2 x i64>, i32) + +declare <2 x float> @llvm.arm64.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) + +declare <4 x float> @llvm.arm64.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) + 
+declare <2 x double> @llvm.arm64.neon.vcvtfxu2fp.v2f64.v2i64(<2 x i64>, i32) + +declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) + +declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) + +declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxs.v2i64.v2f64(<2 x double>, i32) + +declare <2 x i32> @llvm.arm64.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) + +declare <4 x i32> @llvm.arm64.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) + +declare <2 x i64> @llvm.arm64.neon.vcvtfp2fxu.v2i64.v2f64(<2 x double>, i32) + +define <1 x i64> @test_vcvt_n_s64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvt_n_s64_f64 +; CHECK: fcvtzs d{{[0-9]+}}, d{{[0-9]+}}, #64 + %1 = tail call <1 x i64> @llvm.arm64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double> %a, i32 64) + ret <1 x i64> %1 +} + +define <1 x i64> @test_vcvt_n_u64_f64(<1 x double> %a) { +; CHECK-LABEL: test_vcvt_n_u64_f64 +; CHECK: fcvtzu d{{[0-9]+}}, d{{[0-9]+}}, #64 + %1 = tail call <1 x i64> @llvm.arm64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double> %a, i32 64) + ret <1 x i64> %1 +} + +define <1 x double> @test_vcvt_n_f64_s64(<1 x i64> %a) { +; CHECK-LABEL: test_vcvt_n_f64_s64 +; CHECK: scvtf d{{[0-9]+}}, d{{[0-9]+}}, #64 + %1 = tail call <1 x double> @llvm.arm64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64> %a, i32 64) + ret <1 x double> %1 +} + +define <1 x double> @test_vcvt_n_f64_u64(<1 x i64> %a) { +; CHECK-LABEL: test_vcvt_n_f64_u64 +; CHECK: ucvtf d{{[0-9]+}}, d{{[0-9]+}}, #64 + %1 = tail call <1 x double> @llvm.arm64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64> %a, i32 64) + ret <1 x double> %1 +} + +declare <1 x i64> @llvm.arm64.neon.vcvtfp2fxs.v1i64.v1f64(<1 x double>, i32) +declare <1 x i64> @llvm.arm64.neon.vcvtfp2fxu.v1i64.v1f64(<1 x double>, i32) +declare <1 x double> @llvm.arm64.neon.vcvtfxs2fp.v1f64.v1i64(<1 x i64>, i32) +declare <1 x double> @llvm.arm64.neon.vcvtfxu2fp.v1f64.v1i64(<1 x i64>, i32) diff --git a/test/CodeGen/ARM64/aarch64-neon-simd-vget.ll 
b/test/CodeGen/ARM64/aarch64-neon-simd-vget.ll
new file mode 100644
index 00000000000..87f3956eb20
--- /dev/null
+++ b/test/CodeGen/ARM64/aarch64-neon-simd-vget.ll
@@ -0,0 +1,225 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <8 x i8> @test_vget_high_s8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_high_s8:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; lanes 8-15: upper half of %a
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_high_s16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_high_s16:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7: upper half of %a
+  ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vget_high_s32(<4 x i32> %a) {
+; CHECK-LABEL: test_vget_high_s32:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3: upper half of %a
+  ret <2 x i32> %shuffle.i
+}
+
+define <1 x i64> @test_vget_high_s64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_high_s64:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1> ; lane 1: upper half of %a
+  ret <1 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vget_high_u8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_high_u8:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; lanes 8-15: upper half of %a
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_high_u16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_high_u16:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7: upper half of %a
+  ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vget_high_u32(<4 x i32> %a) {
+; CHECK-LABEL: test_vget_high_u32:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3: upper half of %a
+  ret <2 x i32> %shuffle.i
+}
+
+define <1 x i64> @test_vget_high_u64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_high_u64:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1> ; lane 1: upper half of %a
+  ret <1 x i64> %shuffle.i
+}
+
+define <1 x i64> @test_vget_high_p64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_high_p64:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> <i32 1> ; lane 1: upper half of %a
+  ret <1 x i64> %shuffle.i
+}
+
+define <4 x i16> @test_vget_high_f16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_high_f16:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7: upper half of %a
+  ret <4 x i16> %shuffle.i
+}
+
+define <2 x float> @test_vget_high_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vget_high_f32:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 2, i32 3> ; lanes 2-3: upper half of %a
+  ret <2 x float> %shuffle.i
+}
+
+define <8 x i8> @test_vget_high_p8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_high_p8:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> ; lanes 8-15: upper half of %a
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_high_p16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_high_p16:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; lanes 4-7: upper half of %a
+  ret <4 x i16> %shuffle.i
+}
+
+define <1 x double> @test_vget_high_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vget_high_f64:
+; CHECK: ext v0.16b, v0.16b, {{v[0-9]+}}.16b, #8
+entry:
+  %shuffle.i = shufflevector <2 x double> %a, <2 x double> undef, <1 x i32> <i32 1> ; lane 1: upper half of %a
+  ret <1 x double> %shuffle.i
+}
+
+define <8 x i8> @test_vget_low_s8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_low_s8:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; lanes 0-7: lower half of %a
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_low_s16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_low_s16:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; lanes 0-3: lower half of %a
+  ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vget_low_s32(<4 x i32> %a) {
+; CHECK-LABEL: test_vget_low_s32:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1> ; lanes 0-1: lower half of %a
+  ret <2 x i32> %shuffle.i
+}
+
+define <1 x i64> @test_vget_low_s64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_low_s64:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer ; lane 0: lower half of %a
+  ret <1 x i64> %shuffle.i
+}
+
+define <8 x i8> @test_vget_low_u8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_low_u8:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; lanes 0-7: lower half of %a
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_low_u16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_low_u16:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; lanes 0-3: lower half of %a
+  ret <4 x i16> %shuffle.i
+}
+
+define <2 x i32> @test_vget_low_u32(<4 x i32> %a) {
+; CHECK-LABEL: test_vget_low_u32:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 0, i32 1> ; lanes 0-1: lower half of %a
+  ret <2 x i32> %shuffle.i
+}
+
+define <1 x i64> @test_vget_low_u64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_low_u64:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer ; lane 0: lower half of %a
+  ret <1 x i64> %shuffle.i
+}
+
+define <1 x i64> @test_vget_low_p64(<2 x i64> %a) {
+; CHECK-LABEL: test_vget_low_p64:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <2 x i64> %a, <2 x i64> undef, <1 x i32> zeroinitializer ; lane 0: lower half of %a
+  ret <1 x i64> %shuffle.i
+}
+
+define <4 x i16> @test_vget_low_f16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_low_f16:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; lanes 0-3: lower half of %a
+  ret <4 x i16> %shuffle.i
+}
+
+define <2 x float> @test_vget_low_f32(<4 x float> %a) {
+; CHECK-LABEL: test_vget_low_f32:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <4 x float> %a, <4 x float> undef, <2 x i32> <i32 0, i32 1> ; lanes 0-1: lower half of %a
+  ret <2 x float> %shuffle.i
+}
+
+define <8 x i8> @test_vget_low_p8(<16 x i8> %a) {
+; CHECK-LABEL: test_vget_low_p8:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <16 x i8> %a, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; lanes 0-7: lower half of %a
+  ret <8 x i8> %shuffle.i
+}
+
+define <4 x i16> @test_vget_low_p16(<8 x i16> %a) {
+; CHECK-LABEL: test_vget_low_p16:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; lanes 0-3: lower half of %a
+  ret <4 x i16> %shuffle.i
+}
+
+define <1 x double> @test_vget_low_f64(<2 x double> %a) {
+; CHECK-LABEL: test_vget_low_f64:
+; CHECK: ret
+entry:
+  %shuffle.i = shufflevector <2 x double> %a, <2 x double> undef, <1 x i32> zeroinitializer ; lane 0: lower half of %a
+  ret <1 x double> %shuffle.i
+}
diff --git a/test/CodeGen/ARM64/aarch64-neon-vector-list-spill.ll b/test/CodeGen/ARM64/aarch64-neon-vector-list-spill.ll
new file mode 100644
index 00000000000..9e69ac025f9
--- /dev/null
+++ b/test/CodeGen/ARM64/aarch64-neon-vector-list-spill.ll
@@ -0,0 +1,175 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+
+; FIXME: We should not generate ld/st for such register spill/fill, because the
+; test case seems very simple and the register pressure is not high. If the
+; spill/fill algorithm is optimized, this test case may not be triggered. And
+; then we can delete it.
+define i32 @spill.DPairReg(i32* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.DPairReg:
+; CHECK: ld2 { v{{[0-9]+}}.2s, v{{[0-9]+}}.2s }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+entry:
+  %vld = tail call { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32* %arg1) ; two-vector tuple; next used in %if.end
+  %cmp = icmp eq i32 %arg2, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @foo() ; call sits between the tuple load and its only use
+  br label %if.end
+
+if.end:
+  %vld.extract = extractvalue { <2 x i32>, <2 x i32> } %vld, 0 ; first vector of the tuple
+  %res = extractelement <2 x i32> %vld.extract, i32 1
+  ret i32 %res
+}
+
+define i16 @spill.DTripleReg(i16* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.DTripleReg:
+; CHECK: ld3 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16* %arg1) ; three-vector tuple; next used in %if.end
+  %cmp = icmp eq i32 %arg2, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @foo() ; call sits between the tuple load and its only use
+  br label %if.end
+
+if.end:
+  %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0 ; first vector of the tuple
+  %res = extractelement <4 x i16> %vld.extract, i32 1
+  ret i16 %res
+}
+
+define i16 @spill.DQuadReg(i16* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.DQuadReg:
+; CHECK: ld4 { v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16* %arg1) ; four-vector tuple; next used in %if.end
+  %cmp = icmp eq i32 %arg2, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @foo() ; call sits between the tuple load and its only use
+  br label %if.end
+
+if.end:
+  %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0 ; first vector of the tuple
+  %res = extractelement <4 x i16> %vld.extract, i32 0 ; lane 0 here, unlike the sibling tests which read lane 1
+  ret i16 %res
+}
+
+define i32 @spill.QPairReg(i32* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.QPairReg:
+; CHECK: ld2 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+entry:
+  %vld = tail call { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32* %arg1) ; two-vector tuple; next used in %if.end
+  %cmp = icmp eq i32 %arg2, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @foo() ; call sits between the tuple load and its only use
+  br label %if.end
+
+if.end:
+  %vld.extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0 ; first vector of the tuple
+  %res = extractelement <4 x i32> %vld.extract, i32 1
+  ret i32 %res
+}
+
+define float @spill.QTripleReg(float* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.QTripleReg:
+; CHECK: ld3 { v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+entry:
+  %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float* %arg1) ; three-vector tuple; next used in %if.end
+  %cmp = icmp eq i32 %arg2, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @foo() ; call sits between the tuple load and its only use
+  br label %if.end
+
+if.end:
+  %vld3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0 ; first vector of the tuple
+  %res = extractelement <4 x float> %vld3.extract, i32 1
+  ret float %res
+}
+
+define i8 @spill.QQuadReg(i8* %arg1, i32 %arg2) {
+; CHECK-LABEL: spill.QQuadReg:
+; CHECK: ld4 { v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b }, [{{x[0-9]+|sp}}]
+; CHECK: st1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+; CHECK: ld1 { v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d }, [{{x[0-9]+|sp}}]
+entry:
+  %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8* %arg1) ; four-vector tuple; next used in %if.end
+  %cmp = icmp eq i32 %arg2, 0
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @foo() ; call sits between the tuple load and its only use
+  br label %if.end
+
+if.end:
+  %vld.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld, 0 ; first vector of the tuple
+  %res = extractelement <16 x i8> %vld.extract, i32 1
+  ret i8 %res
+}
+
+declare { <2 x i32>, <2 x i32> } @llvm.arm64.neon.ld2.v2i32.p0i32(i32*)
+declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld3.v4i16.p0i16(i16*)
+declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm64.neon.ld4.v4i16.p0i16(i16*)
+declare { <4 x i32>, <4 x i32> } @llvm.arm64.neon.ld2.v4i32.p0i32(i32*)
+declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm64.neon.ld3.v4f32.p0f32(float*)
+declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0i8(i8*)
+
+declare void @foo()
+
+; FIXME: We should not generate ld/st for such register spill/fill, because the
+; test case seems very simple and the register pressure is not high. If the
+; spill/fill algorithm is optimized, this test case may not be triggered. And
+; then we can delete it.
+; check the spill for Register Class QPair_with_qsub_0_in_FPR128Lo
+define <8 x i16> @test_2xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) {
+  tail call void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
+  tail call void @foo()
+  %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1> ; concatenates a zero lane and %a into <2 x i64>
+  %1 = bitcast <2 x i64> %sv to <8 x i16>
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> ; NOTE(review): mask operand was missing; reconstructed as a splat of lane 2 - confirm against upstream
+  %3 = mul <8 x i16> %2, %2
+  ret <8 x i16> %3
+}
+
+; check the spill for Register Class QTriple_with_qsub_0_in_FPR128Lo
+define <8 x i16> @test_3xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) {
+  tail call void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
+  tail call void @foo()
+  %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1> ; concatenates a zero lane and %a into <2 x i64>
+  %1 = bitcast <2 x i64> %sv to <8 x i16>
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> ; NOTE(review): mask operand was missing; reconstructed as a splat of lane 2 - confirm against upstream
+  %3 = mul <8 x i16> %2, %2
+  ret <8 x i16> %3
+}
+
+; check the spill for Register Class QQuad_with_qsub_0_in_FPR128Lo
+define <8 x i16> @test_4xFPR128Lo(i64 %got, i64* %ptr, <1 x i64> %a) {
+  tail call void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, i64* %ptr)
+  tail call void @foo()
+  %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1> ; concatenates a zero lane and %a into <2 x i64>
+  %1 = bitcast <2 x i64> %sv to <8 x i16>
+  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2> ; NOTE(review): mask operand was missing; reconstructed as a splat of lane 2 - confirm against upstream
+  %3 = mul <8 x i16> %2, %2
+  ret <8 x i16> %3
+}
+
+declare void @llvm.arm64.neon.st2lane.v1i64.p0i64(<1 x i64>, <1 x i64>, i64, i64*)
+declare void @llvm.arm64.neon.st3lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)
+declare void @llvm.arm64.neon.st4lane.v1i64.p0i64(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, i64*)