llvm-mirror/test/CodeGen/Thumb2/mve-intrinsics/vmovn.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck --check-prefix=LE %s
; RUN: llc -mtriple=thumbebv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck --check-prefix=BE %s

; vmovnbq narrowing i16 -> i8 (the "_s16" flavour).
; IR pattern: swap the two bytes of every adjacent i8 lane pair of %a, convert
; the result to <8 x i16> with vreinterpretq, interleave it with %b so that
; even result lanes come from %b and odd ones from the converted %a, then
; truncate each lane to i8.
; Expected output: a lone vmovnb.i16 for the little-endian triple; for the
; big-endian triple the same instruction surrounded by vrev64 lane reversals.
define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_s16(<16 x i8> %a, <8 x i16> %b) {
; LE-LABEL: test_vmovnbq_s16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnb.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_s16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmovnb.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  ; Exchange i8 lanes (0,1), (2,3), ... of %a.
  %a.swapped = shufflevector <16 x i8> %a, <16 x i8> undef,
      <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  %a.wide = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a.swapped)
  ; Result lane 2*i takes %b[i]; result lane 2*i+1 takes %a.wide[i].
  %interleaved = shufflevector <8 x i16> %b, <8 x i16> %a.wide,
      <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %narrowed = trunc <16 x i16> %interleaved to <16 x i8>
  ret <16 x i8> %narrowed
}

; vmovnbq narrowing i32 -> i16 (the "_s32" flavour).
; IR pattern: swap every adjacent pair of i16 lanes of %a, convert the result
; to <4 x i32> with vreinterpretq, interleave it with %b so that even result
; lanes come from %b and odd ones from the converted %a, then truncate each
; lane to i16.
; Expected output: a lone vmovnb.i32 for the little-endian triple; for the
; big-endian triple the same instruction surrounded by vrev64 lane reversals.
define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_s32(<8 x i16> %a, <4 x i32> %b) {
; LE-LABEL: test_vmovnbq_s32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnb.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_s32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmovnb.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  ; Exchange i16 lanes (0,1), (2,3), ... of %a.
  %a.swapped = shufflevector <8 x i16> %a, <8 x i16> undef,
      <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %a.wide = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a.swapped)
  ; Result lane 2*i takes %b[i]; result lane 2*i+1 takes %a.wide[i].
  %interleaved = shufflevector <4 x i32> %b, <4 x i32> %a.wide,
      <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %narrowed = trunc <8 x i32> %interleaved to <8 x i16>
  ret <8 x i16> %narrowed
}

; vmovnbq narrowing i16 -> i8 (the "_u16" flavour).
; IR pattern: swap the two bytes of every adjacent i8 lane pair of %a, convert
; the result to <8 x i16> with vreinterpretq, interleave it with %b so that
; even result lanes come from %b and odd ones from the converted %a, then
; truncate each lane to i8.
; Expected output: a lone vmovnb.i16 for the little-endian triple; for the
; big-endian triple the same instruction surrounded by vrev64 lane reversals.
define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_u16(<16 x i8> %a, <8 x i16> %b) {
; LE-LABEL: test_vmovnbq_u16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnb.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_u16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmovnb.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  ; Exchange i8 lanes (0,1), (2,3), ... of %a.
  %a.swapped = shufflevector <16 x i8> %a, <16 x i8> undef,
      <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>
  %a.wide = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a.swapped)
  ; Result lane 2*i takes %b[i]; result lane 2*i+1 takes %a.wide[i].
  %interleaved = shufflevector <8 x i16> %b, <8 x i16> %a.wide,
      <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %narrowed = trunc <16 x i16> %interleaved to <16 x i8>
  ret <16 x i8> %narrowed
}

; vmovnbq narrowing i32 -> i16 (the "_u32" flavour).
; IR pattern: swap every adjacent pair of i16 lanes of %a, convert the result
; to <4 x i32> with vreinterpretq, interleave it with %b so that even result
; lanes come from %b and odd ones from the converted %a, then truncate each
; lane to i16.
; Expected output: a lone vmovnb.i32 for the little-endian triple; for the
; big-endian triple the same instruction surrounded by vrev64 lane reversals.
define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_u32(<8 x i16> %a, <4 x i32> %b) {
; LE-LABEL: test_vmovnbq_u32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnb.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovnbq_u32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmovnb.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  ; Exchange i16 lanes (0,1), (2,3), ... of %a.
  %a.swapped = shufflevector <8 x i16> %a, <8 x i16> undef,
      <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
  %a.wide = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a.swapped)
  ; Result lane 2*i takes %b[i]; result lane 2*i+1 takes %a.wide[i].
  %interleaved = shufflevector <4 x i32> %b, <4 x i32> %a.wide,
      <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %narrowed = trunc <8 x i32> %interleaved to <8 x i16>
  ret <8 x i16> %narrowed
}

; vmovntq narrowing i16 -> i8 (the "_s16" flavour).
; IR pattern: convert %a to <8 x i16> with vreinterpretq, interleave it with
; %b so that even result lanes come from the converted %a and odd ones from
; %b, then truncate each lane to i8.
; Expected output: a lone vmovnt.i16 for the little-endian triple; for the
; big-endian triple the same instruction surrounded by vrev64 lane reversals.
define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_s16(<16 x i8> %a, <8 x i16> %b) {
; LE-LABEL: test_vmovntq_s16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_s16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmovnt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %a.wide = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
  ; Result lane 2*i takes %a.wide[i]; result lane 2*i+1 takes %b[i].
  %interleaved = shufflevector <8 x i16> %a.wide, <8 x i16> %b,
      <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %narrowed = trunc <16 x i16> %interleaved to <16 x i8>
  ret <16 x i8> %narrowed
}

; vmovntq narrowing i32 -> i16 (the "_s32" flavour).
; IR pattern: convert %a to <4 x i32> with vreinterpretq, interleave it with
; %b so that even result lanes come from the converted %a and odd ones from
; %b, then truncate each lane to i16.
; Expected output: a lone vmovnt.i32 for the little-endian triple; for the
; big-endian triple the same instruction surrounded by vrev64 lane reversals.
define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_s32(<8 x i16> %a, <4 x i32> %b) {
; LE-LABEL: test_vmovntq_s32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_s32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmovnt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %a.wide = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
  ; Result lane 2*i takes %a.wide[i]; result lane 2*i+1 takes %b[i].
  %interleaved = shufflevector <4 x i32> %a.wide, <4 x i32> %b,
      <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %narrowed = trunc <8 x i32> %interleaved to <8 x i16>
  ret <8 x i16> %narrowed
}

; vmovntq narrowing i16 -> i8 (the "_u16" flavour).
; IR pattern: convert %a to <8 x i16> with vreinterpretq, interleave it with
; %b so that even result lanes come from the converted %a and odd ones from
; %b, then truncate each lane to i8.
; Expected output: a lone vmovnt.i16 for the little-endian triple; for the
; big-endian triple the same instruction surrounded by vrev64 lane reversals.
define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_u16(<16 x i8> %a, <8 x i16> %b) {
; LE-LABEL: test_vmovntq_u16:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnt.i16 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_u16:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.16 q2, q1
; BE-NEXT:    vrev64.8 q1, q0
; BE-NEXT:    vmovnt.i16 q1, q2
; BE-NEXT:    vrev64.8 q0, q1
; BE-NEXT:    bx lr
entry:
  %a.wide = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)
  ; Result lane 2*i takes %a.wide[i]; result lane 2*i+1 takes %b[i].
  %interleaved = shufflevector <8 x i16> %a.wide, <8 x i16> %b,
      <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
  %narrowed = trunc <16 x i16> %interleaved to <16 x i8>
  ret <16 x i8> %narrowed
}

; vmovntq narrowing i32 -> i16 (the "_u32" flavour).
; IR pattern: convert %a to <4 x i32> with vreinterpretq, interleave it with
; %b so that even result lanes come from the converted %a and odd ones from
; %b, then truncate each lane to i16.
; Expected output: a lone vmovnt.i32 for the little-endian triple; for the
; big-endian triple the same instruction surrounded by vrev64 lane reversals.
define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_u32(<8 x i16> %a, <4 x i32> %b) {
; LE-LABEL: test_vmovntq_u32:
; LE:       @ %bb.0: @ %entry
; LE-NEXT:    vmovnt.i32 q0, q1
; LE-NEXT:    bx lr
;
; BE-LABEL: test_vmovntq_u32:
; BE:       @ %bb.0: @ %entry
; BE-NEXT:    vrev64.32 q2, q1
; BE-NEXT:    vrev64.16 q1, q0
; BE-NEXT:    vmovnt.i32 q1, q2
; BE-NEXT:    vrev64.16 q0, q1
; BE-NEXT:    bx lr
entry:
  %a.wide = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)
  ; Result lane 2*i takes %a.wide[i]; result lane 2*i+1 takes %b[i].
  %interleaved = shufflevector <4 x i32> %a.wide, <4 x i32> %b,
      <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
  %narrowed = trunc <8 x i32> %interleaved to <8 x i16>
  ret <8 x i16> %narrowed
}

; Declarations of the two vreinterpretq intrinsic overloads called by the test
; functions in this file: <16 x i8> -> <8 x i16> and <8 x i16> -> <4 x i32>.
declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8>)
declare <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16>)
; [ARM,MVE] Add the vmovnbq,vmovntq intrinsic family.
;
; Summary:
; These are in some sense the inverse of vmovl[bt]q: they take a vector of n
; wide elements and truncate each to half its width. So they only write half a
; vector's worth of output data, and therefore they also take an 'inactive'
; parameter to provide the other half of the data in the output vector. So
; vmovnb overwrites the even lanes of 'inactive' with the narrowed values from
; the main input, and vmovnt overwrites the odd lanes.
;
; LLVM had existing codegen which generates these MVE instructions in response
; to IR that takes two vectors of wide elements, or two vectors of narrow ones.
; But in this case, we have one vector of each. So my clang codegen strategy is
; to narrow the input vector of wide elements by simply reinterpreting it as
; the output type, and then we have two narrow vectors and can represent the
; operation as a vector shuffle that interleaves lanes from both of them.
;
; Even so, not all the cases I needed ended up being selected as a single MVE
; instruction, so I've added a couple more patterns that spot combinations of
; the 'MVEvmovn' and 'ARMvrev32' SDNodes which can be generated as a VMOVN
; instruction with operands swapped.
;
; This commit adds the unpredicated forms only.
;
; Reviewers: dmgreen, miyuki, MarkMurrayARM, ostannard
; Reviewed By: dmgreen
; Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits
; Tags: #clang, #llvm
; Differential Revision: https://reviews.llvm.org/D74337
; 2020-02-17 17:06:20 +00:00
			`; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py`
			`; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s \| FileCheck --check-prefix=LE %s`
			`; RUN: llc -mtriple=thumbebv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s \| FileCheck --check-prefix=BE %s`

			`define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_s16(<16 x i8> %a, <8 x i16> %b) {`
			`; LE-LABEL: test_vmovnbq_s16:`
			`; LE: @ %bb.0: @ %entry`
			`; LE-NEXT: vmovnb.i16 q0, q1`
			`; LE-NEXT: bx lr`
			`;`
			`; BE-LABEL: test_vmovnbq_s16:`
			`; BE: @ %bb.0: @ %entry`
			`; BE-NEXT: vrev64.16 q2, q1`
			`; BE-NEXT: vrev64.8 q1, q0`
			`; BE-NEXT: vmovnb.i16 q1, q2`
			`; BE-NEXT: vrev64.8 q0, q1`
			`; BE-NEXT: bx lr`
			`entry:`
			`%0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>`
			`%1 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %0)`
			`%2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>`
			`%3 = trunc <16 x i16> %2 to <16 x i8>`
			`ret <16 x i8> %3`
			`}`

			`define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_s32(<8 x i16> %a, <4 x i32> %b) {`
			`; LE-LABEL: test_vmovnbq_s32:`
			`; LE: @ %bb.0: @ %entry`
			`; LE-NEXT: vmovnb.i32 q0, q1`
			`; LE-NEXT: bx lr`
			`;`
			`; BE-LABEL: test_vmovnbq_s32:`
			`; BE: @ %bb.0: @ %entry`
			`; BE-NEXT: vrev64.32 q2, q1`
			`; BE-NEXT: vrev64.16 q1, q0`
			`; BE-NEXT: vmovnb.i32 q1, q2`
			`; BE-NEXT: vrev64.16 q0, q1`
			`; BE-NEXT: bx lr`
			`entry:`
			`%0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>`
			`%1 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %0)`
			`%2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>`
			`%3 = trunc <8 x i32> %2 to <8 x i16>`
			`ret <8 x i16> %3`
			`}`

			`define arm_aapcs_vfpcc <16 x i8> @test_vmovnbq_u16(<16 x i8> %a, <8 x i16> %b) {`
			`; LE-LABEL: test_vmovnbq_u16:`
			`; LE: @ %bb.0: @ %entry`
			`; LE-NEXT: vmovnb.i16 q0, q1`
			`; LE-NEXT: bx lr`
			`;`
			`; BE-LABEL: test_vmovnbq_u16:`
			`; BE: @ %bb.0: @ %entry`
			`; BE-NEXT: vrev64.16 q2, q1`
			`; BE-NEXT: vrev64.8 q1, q0`
			`; BE-NEXT: vmovnb.i16 q1, q2`
			`; BE-NEXT: vrev64.8 q0, q1`
			`; BE-NEXT: bx lr`
			`entry:`
			`%0 = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6, i32 9, i32 8, i32 11, i32 10, i32 13, i32 12, i32 15, i32 14>`
			`%1 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %0)`
			`%2 = shufflevector <8 x i16> %b, <8 x i16> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>`
			`%3 = trunc <16 x i16> %2 to <16 x i8>`
			`ret <16 x i8> %3`
			`}`

			`define arm_aapcs_vfpcc <8 x i16> @test_vmovnbq_u32(<8 x i16> %a, <4 x i32> %b) {`
			`; LE-LABEL: test_vmovnbq_u32:`
			`; LE: @ %bb.0: @ %entry`
			`; LE-NEXT: vmovnb.i32 q0, q1`
			`; LE-NEXT: bx lr`
			`;`
			`; BE-LABEL: test_vmovnbq_u32:`
			`; BE: @ %bb.0: @ %entry`
			`; BE-NEXT: vrev64.32 q2, q1`
			`; BE-NEXT: vrev64.16 q1, q0`
			`; BE-NEXT: vmovnb.i32 q1, q2`
			`; BE-NEXT: vrev64.16 q0, q1`
			`; BE-NEXT: bx lr`
			`entry:`
			`%0 = shufflevector <8 x i16> %a, <8 x i16> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>`
			`%1 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %0)`
			`%2 = shufflevector <4 x i32> %b, <4 x i32> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>`
			`%3 = trunc <8 x i32> %2 to <8 x i16>`
			`ret <8 x i16> %3`
			`}`

			`define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_s16(<16 x i8> %a, <8 x i16> %b) {`
			`; LE-LABEL: test_vmovntq_s16:`
			`; LE: @ %bb.0: @ %entry`
			`; LE-NEXT: vmovnt.i16 q0, q1`
			`; LE-NEXT: bx lr`
			`;`
			`; BE-LABEL: test_vmovntq_s16:`
			`; BE: @ %bb.0: @ %entry`
			`; BE-NEXT: vrev64.16 q2, q1`
			`; BE-NEXT: vrev64.8 q1, q0`
			`; BE-NEXT: vmovnt.i16 q1, q2`
			`; BE-NEXT: vrev64.8 q0, q1`
			`; BE-NEXT: bx lr`
			`entry:`
			`%0 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)`
			`%1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>`
			`%2 = trunc <16 x i16> %1 to <16 x i8>`
			`ret <16 x i8> %2`
			`}`

			`define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_s32(<8 x i16> %a, <4 x i32> %b) {`
			`; LE-LABEL: test_vmovntq_s32:`
			`; LE: @ %bb.0: @ %entry`
			`; LE-NEXT: vmovnt.i32 q0, q1`
			`; LE-NEXT: bx lr`
			`;`
			`; BE-LABEL: test_vmovntq_s32:`
			`; BE: @ %bb.0: @ %entry`
			`; BE-NEXT: vrev64.32 q2, q1`
			`; BE-NEXT: vrev64.16 q1, q0`
			`; BE-NEXT: vmovnt.i32 q1, q2`
			`; BE-NEXT: vrev64.16 q0, q1`
			`; BE-NEXT: bx lr`
			`entry:`
			`%0 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)`
			`%1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>`
			`%2 = trunc <8 x i32> %1 to <8 x i16>`
			`ret <8 x i16> %2`
			`}`

			`define arm_aapcs_vfpcc <16 x i8> @test_vmovntq_u16(<16 x i8> %a, <8 x i16> %b) {`
			`; LE-LABEL: test_vmovntq_u16:`
			`; LE: @ %bb.0: @ %entry`
			`; LE-NEXT: vmovnt.i16 q0, q1`
			`; LE-NEXT: bx lr`
			`;`
			`; BE-LABEL: test_vmovntq_u16:`
			`; BE: @ %bb.0: @ %entry`
			`; BE-NEXT: vrev64.16 q2, q1`
			`; BE-NEXT: vrev64.8 q1, q0`
			`; BE-NEXT: vmovnt.i16 q1, q2`
			`; BE-NEXT: vrev64.8 q0, q1`
			`; BE-NEXT: bx lr`
			`entry:`
			`%0 = tail call <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8> %a)`
			`%1 = shufflevector <8 x i16> %0, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>`
			`%2 = trunc <16 x i16> %1 to <16 x i8>`
			`ret <16 x i8> %2`
			`}`

			`define arm_aapcs_vfpcc <8 x i16> @test_vmovntq_u32(<8 x i16> %a, <4 x i32> %b) {`
			`; LE-LABEL: test_vmovntq_u32:`
			`; LE: @ %bb.0: @ %entry`
			`; LE-NEXT: vmovnt.i32 q0, q1`
			`; LE-NEXT: bx lr`
			`;`
			`; BE-LABEL: test_vmovntq_u32:`
			`; BE: @ %bb.0: @ %entry`
			`; BE-NEXT: vrev64.32 q2, q1`
			`; BE-NEXT: vrev64.16 q1, q0`
			`; BE-NEXT: vmovnt.i32 q1, q2`
			`; BE-NEXT: vrev64.16 q0, q1`
			`; BE-NEXT: bx lr`
			`entry:`
			`%0 = tail call <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16> %a)`
			`%1 = shufflevector <4 x i32> %0, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>`
			`%2 = trunc <8 x i32> %1 to <8 x i16>`
			`ret <8 x i16> %2`
			`}`

			`declare <8 x i16> @llvm.arm.mve.vreinterpretq.v8i16.v16i8(<16 x i8>)`
			`declare <4 x i32> @llvm.arm.mve.vreinterpretq.v4i32.v8i16(<8 x i16>)`