1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-23 11:13:28 +01:00
llvm-mirror/test/CodeGen/Thumb2/mve-intrinsics/idup.ll
Simon Tatham e3f9be3c6f [ARM,MVE] Add intrinsics for v[id]dupq and v[id]wdupq.
Summary:
These instructions generate a vector of consecutive elements starting
from a given base value and incrementing by 1, 2, 4 or 8. The `wdup`
versions also wrap the values back to zero when they reach a given
limit value. The instruction updates the scalar base register so that
another use of the same instruction will continue the sequence from
where the previous one left off.

At the IR level, I've represented these instructions as a family of
target-specific intrinsics with two return values (the constructed
vector and the updated base). The user-facing ACLE API provides a set
of intrinsics that throw away the written-back base and another set
that receive it as a pointer so they can update it, plus the usual
predicated versions.

Because the intrinsics return two values (as do the underlying
instructions), the isel has to be done in C++.

This is the first family of MVE intrinsics that use the `imm_1248`
immediate type in the clang Tablegen framework, so naturally, I found
I'd given it the wrong C integer type. Also added some tests of the
check that the immediate has a legal value, because this is the first
time those particular checks have been exercised.

Finally, I also had to fix a bug in MveEmitter which failed an
assertion when I nested two `seq` nodes (the inner one used to extract
the two values from the pair returned by the IR intrinsic, and the
outer one put on by the predication multiclass).

Reviewers: dmgreen, MarkMurrayARM, miyuki, ostannard

Reviewed By: dmgreen

Subscribers: kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D73357
2020-02-03 11:20:06 +00:00

776 lines
29 KiB
LLVM

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
; Unpredicated VIDUP/VDDUP with a scalar base %a and an immediate step
; (legal values: 1, 2, 4 or 8). The intrinsic returns { vector, updated
; base }; these tests use only field 0 (the vector), so the written-back
; base is dead and the selected instruction needs no accompanying store.
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_n_u8(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vidup.u8 q0, r0, #4
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 %a, i32 4)
%1 = extractvalue { <16 x i8>, i32 } %0, 0
ret <16 x i8> %1
}
define arm_aapcs_vfpcc <8 x i16> @test_vidupq_n_u16(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vidup.u16 q0, r0, #1
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %a, i32 1)
%1 = extractvalue { <8 x i16>, i32 } %0, 0
ret <8 x i16> %1
}
define arm_aapcs_vfpcc <4 x i32> @test_vidupq_n_u32(i32 %a) {
; CHECK-LABEL: test_vidupq_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vidup.u32 q0, r0, #4
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 %a, i32 4)
%1 = extractvalue { <4 x i32>, i32 } %0, 0
ret <4 x i32> %1
}
; VDDUP: same shape as VIDUP but the element sequence counts down from
; the base instead of up.
define arm_aapcs_vfpcc <16 x i8> @test_vddupq_n_u8(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vddup.u8 q0, r0, #2
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 %a, i32 2)
%1 = extractvalue { <16 x i8>, i32 } %0, 0
ret <16 x i8> %1
}
define arm_aapcs_vfpcc <8 x i16> @test_vddupq_n_u16(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vddup.u16 q0, r0, #4
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 %a, i32 4)
%1 = extractvalue { <8 x i16>, i32 } %0, 0
ret <8 x i16> %1
}
define arm_aapcs_vfpcc <4 x i32> @test_vddupq_n_u32(i32 %a) {
; CHECK-LABEL: test_vddupq_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vddup.u32 q0, r0, #2
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 %a, i32 2)
%1 = extractvalue { <4 x i32>, i32 } %0, 0
ret <4 x i32> %1
}
; Wrapping variants VIWDUP/VDWDUP: a second scalar operand %b gives the
; wrap limit, so they take two GPRs (r0 = base, r1 = limit) plus the
; immediate step. Writeback result is again discarded in these tests.
define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_n_u8(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: viwdup.u8 q0, r0, r1, #4
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 %a, i32 %b, i32 4)
%1 = extractvalue { <16 x i8>, i32 } %0, 0
ret <16 x i8> %1
}
define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_n_u16(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: viwdup.u16 q0, r0, r1, #2
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 %a, i32 %b, i32 2)
%1 = extractvalue { <8 x i16>, i32 } %0, 0
ret <8 x i16> %1
}
define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_n_u32(i32 %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: viwdup.u32 q0, r0, r1, #8
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 %a, i32 %b, i32 8)
%1 = extractvalue { <4 x i32>, i32 } %0, 0
ret <4 x i32> %1
}
define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_n_u8(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vdwdup.u8 q0, r0, r1, #4
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 %a, i32 %b, i32 4)
%1 = extractvalue { <16 x i8>, i32 } %0, 0
ret <16 x i8> %1
}
define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_n_u16(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vdwdup.u16 q0, r0, r1, #8
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 %a, i32 %b, i32 8)
%1 = extractvalue { <8 x i16>, i32 } %0, 0
ret <8 x i16> %1
}
define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_n_u32(i32 %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vdwdup.u32 q0, r0, r1, #1
; CHECK-NEXT: bx lr
entry:
%0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 %a, i32 %b, i32 1)
%1 = extractvalue { <4 x i32>, i32 } %0, 0
ret <4 x i32> %1
}
; Writeback (_wb) forms: the base lives behind a pointer. The test loads
; it, calls the intrinsic, stores field 1 (the updated base) back, and
; returns field 0 (the vector). Codegen should therefore be ldr +
; v[id]dup + str, with the same GPR carrying the base in and out.
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_wb_u8(i32* nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vidup.u8 q0, r2, #8
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 %0, i32 8)
%2 = extractvalue { <16 x i8>, i32 } %1, 1
store i32 %2, i32* %a, align 4
%3 = extractvalue { <16 x i8>, i32 } %1, 0
ret <16 x i8> %3
}
define arm_aapcs_vfpcc <8 x i16> @test_vidupq_wb_u16(i32* nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vidup.u16 q0, r2, #1
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %0, i32 1)
%2 = extractvalue { <8 x i16>, i32 } %1, 1
store i32 %2, i32* %a, align 4
%3 = extractvalue { <8 x i16>, i32 } %1, 0
ret <8 x i16> %3
}
define arm_aapcs_vfpcc <4 x i32> @test_vidupq_wb_u32(i32* nocapture %a) {
; CHECK-LABEL: test_vidupq_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vidup.u32 q0, r2, #4
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 %0, i32 4)
%2 = extractvalue { <4 x i32>, i32 } %1, 1
store i32 %2, i32* %a, align 4
%3 = extractvalue { <4 x i32>, i32 } %1, 0
ret <4 x i32> %3
}
define arm_aapcs_vfpcc <16 x i8> @test_vddupq_wb_u8(i32* nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vddup.u8 q0, r2, #2
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 %0, i32 2)
%2 = extractvalue { <16 x i8>, i32 } %1, 1
store i32 %2, i32* %a, align 4
%3 = extractvalue { <16 x i8>, i32 } %1, 0
ret <16 x i8> %3
}
define arm_aapcs_vfpcc <8 x i16> @test_vddupq_wb_u16(i32* nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vddup.u16 q0, r2, #8
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 %0, i32 8)
%2 = extractvalue { <8 x i16>, i32 } %1, 1
store i32 %2, i32* %a, align 4
%3 = extractvalue { <8 x i16>, i32 } %1, 0
ret <8 x i16> %3
}
define arm_aapcs_vfpcc <4 x i32> @test_vddupq_wb_u32(i32* nocapture %a) {
; CHECK-LABEL: test_vddupq_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vddup.u32 q0, r2, #2
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 %0, i32 2)
%2 = extractvalue { <4 x i32>, i32 } %1, 1
store i32 %2, i32* %a, align 4
%3 = extractvalue { <4 x i32>, i32 } %1, 0
ret <4 x i32> %3
}
; Writeback forms of the wrapping variants: base behind pointer %a,
; wrap limit in scalar %b; the updated base is stored back through %a.
define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_wb_u8(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vdwdup.u8 q0, r2, r1, #4
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 %0, i32 %b, i32 4)
%2 = extractvalue { <16 x i8>, i32 } %1, 1
store i32 %2, i32* %a, align 4
%3 = extractvalue { <16 x i8>, i32 } %1, 0
ret <16 x i8> %3
}
define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_wb_u16(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vdwdup.u16 q0, r2, r1, #4
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 %0, i32 %b, i32 4)
%2 = extractvalue { <8 x i16>, i32 } %1, 1
store i32 %2, i32* %a, align 4
%3 = extractvalue { <8 x i16>, i32 } %1, 0
ret <8 x i16> %3
}
define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_wb_u8(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: viwdup.u8 q0, r2, r1, #1
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 %0, i32 %b, i32 1)
%2 = extractvalue { <16 x i8>, i32 } %1, 1
store i32 %2, i32* %a, align 4
%3 = extractvalue { <16 x i8>, i32 } %1, 0
ret <16 x i8> %3
}
define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_wb_u16(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: viwdup.u16 q0, r2, r1, #1
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 %0, i32 %b, i32 1)
%2 = extractvalue { <8 x i16>, i32 } %1, 1
store i32 %2, i32* %a, align 4
%3 = extractvalue { <8 x i16>, i32 } %1, 0
ret <8 x i16> %3
}
define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_wb_u32(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_viwdupq_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: viwdup.u32 q0, r2, r1, #8
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 %0, i32 %b, i32 8)
%2 = extractvalue { <4 x i32>, i32 } %1, 1
store i32 %2, i32* %a, align 4
%3 = extractvalue { <4 x i32>, i32 } %1, 0
ret <4 x i32> %3
}
define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_wb_u32(i32* nocapture %a, i32 %b) {
; CHECK-LABEL: test_vdwdupq_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vdwdup.u32 q0, r2, r1, #2
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 %0, i32 %b, i32 2)
%2 = extractvalue { <4 x i32>, i32 } %1, 1
store i32 %2, i32* %a, align 4
%3 = extractvalue { <4 x i32>, i32 } %1, 0
ret <4 x i32> %3
}
; Predicated (_m) forms: the i16 predicate %p is converted to a vector
; predicate via llvm.arm.mve.pred.i2v, and inactive lanes are taken from
; %inactive. Codegen should be vmsr p0 + vpst + the T-suffixed (vidupt /
; vddupt) instruction inside the VPT block.
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_m_n_u8(<16 x i8> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vidupt.u8 q0, r0, #8
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 8, <16 x i1> %1)
%3 = extractvalue { <16 x i8>, i32 } %2, 0
ret <16 x i8> %3
}
define arm_aapcs_vfpcc <8 x i16> @test_vidupq_m_n_u16(<8 x i16> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vidupt.u16 q0, r0, #8
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 8, <8 x i1> %1)
%3 = extractvalue { <8 x i16>, i32 } %2, 0
ret <8 x i16> %3
}
define arm_aapcs_vfpcc <4 x i32> @test_vidupq_m_n_u32(<4 x i32> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vidupt.u32 q0, r0, #2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 2, <4 x i1> %1)
%3 = extractvalue { <4 x i32>, i32 } %2, 0
ret <4 x i32> %3
}
define arm_aapcs_vfpcc <16 x i8> @test_vddupq_m_n_u8(<16 x i8> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vddupt.u8 q0, r0, #8
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 8, <16 x i1> %1)
%3 = extractvalue { <16 x i8>, i32 } %2, 0
ret <16 x i8> %3
}
define arm_aapcs_vfpcc <8 x i16> @test_vddupq_m_n_u16(<8 x i16> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vddupt.u16 q0, r0, #2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 2, <8 x i1> %1)
%3 = extractvalue { <8 x i16>, i32 } %2, 0
ret <8 x i16> %3
}
define arm_aapcs_vfpcc <4 x i32> @test_vddupq_m_n_u32(<4 x i32> %inactive, i32 %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vddupt.u32 q0, r0, #8
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 8, <4 x i1> %1)
%3 = extractvalue { <4 x i32>, i32 } %2, 0
ret <4 x i32> %3
}
; Predicated wrapping variants: as above but with the extra wrap-limit
; operand %b, selecting viwdupt / vdwdupt under a VPST.
define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_m_n_u8(<16 x i8> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: viwdupt.u8 q0, r0, r1, #8
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 %b, i32 8, <16 x i1> %1)
%3 = extractvalue { <16 x i8>, i32 } %2, 0
ret <16 x i8> %3
}
define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_m_n_u16(<8 x i16> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: viwdupt.u16 q0, r0, r1, #8
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 %b, i32 8, <8 x i1> %1)
%3 = extractvalue { <8 x i16>, i32 } %2, 0
ret <8 x i16> %3
}
define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_m_n_u32(<4 x i32> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: viwdupt.u32 q0, r0, r1, #4
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 %b, i32 4, <4 x i1> %1)
%3 = extractvalue { <4 x i32>, i32 } %2, 0
ret <4 x i32> %3
}
define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_m_n_u8(<16 x i8> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vdwdupt.u8 q0, r0, r1, #1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 %b, i32 1, <16 x i1> %1)
%3 = extractvalue { <16 x i8>, i32 } %2, 0
ret <16 x i8> %3
}
define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_m_n_u16(<8 x i16> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vdwdupt.u16 q0, r0, r1, #2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 %b, i32 2, <8 x i1> %1)
%3 = extractvalue { <8 x i16>, i32 } %2, 0
ret <8 x i16> %3
}
define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_m_n_u32(<4 x i32> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vdwdupt.u32 q0, r0, r1, #4
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 %b, i32 4, <4 x i1> %1)
%3 = extractvalue { <4 x i32>, i32 } %2, 0
ret <4 x i32> %3
}
; Predicated + writeback: combines the _m and _wb patterns. Base is
; loaded from %a, the predicated intrinsic runs in a VPT block, and the
; updated base (field 1) is stored back through %a.
define arm_aapcs_vfpcc <16 x i8> @test_vidupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vidupt.u8 q0, r2, #8
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = zext i16 %p to i32
%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
%3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 8, <16 x i1> %2)
%4 = extractvalue { <16 x i8>, i32 } %3, 1
store i32 %4, i32* %a, align 4
%5 = extractvalue { <16 x i8>, i32 } %3, 0
ret <16 x i8> %5
}
define arm_aapcs_vfpcc <8 x i16> @test_vidupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vidupt.u16 q0, r2, #2
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = zext i16 %p to i32
%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
%3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 2, <8 x i1> %2)
%4 = extractvalue { <8 x i16>, i32 } %3, 1
store i32 %4, i32* %a, align 4
%5 = extractvalue { <8 x i16>, i32 } %3, 0
ret <8 x i16> %5
}
define arm_aapcs_vfpcc <4 x i32> @test_vidupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vidupq_m_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vidupt.u32 q0, r2, #8
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = zext i16 %p to i32
%2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
%3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 8, <4 x i1> %2)
%4 = extractvalue { <4 x i32>, i32 } %3, 1
store i32 %4, i32* %a, align 4
%5 = extractvalue { <4 x i32>, i32 } %3, 0
ret <4 x i32> %5
}
define arm_aapcs_vfpcc <16 x i8> @test_vddupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vddupt.u8 q0, r2, #1
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = zext i16 %p to i32
%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
%3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 1, <16 x i1> %2)
%4 = extractvalue { <16 x i8>, i32 } %3, 1
store i32 %4, i32* %a, align 4
%5 = extractvalue { <16 x i8>, i32 } %3, 0
ret <16 x i8> %5
}
define arm_aapcs_vfpcc <8 x i16> @test_vddupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vddupt.u16 q0, r2, #1
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = zext i16 %p to i32
%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
%3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 1, <8 x i1> %2)
%4 = extractvalue { <8 x i16>, i32 } %3, 1
store i32 %4, i32* %a, align 4
%5 = extractvalue { <8 x i16>, i32 } %3, 0
ret <8 x i16> %5
}
define arm_aapcs_vfpcc <4 x i32> @test_vddupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i16 zeroext %p) {
; CHECK-LABEL: test_vddupq_m_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr r2, [r0]
; CHECK-NEXT: vmsr p0, r1
; CHECK-NEXT: vpst
; CHECK-NEXT: vddupt.u32 q0, r2, #4
; CHECK-NEXT: str r2, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = zext i16 %p to i32
%2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
%3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 4, <4 x i1> %2)
%4 = extractvalue { <4 x i32>, i32 } %3, 1
store i32 %4, i32* %a, align 4
%5 = extractvalue { <4 x i32>, i32 } %3, 0
ret <4 x i32> %5
}
; Predicated + writeback wrapping variants. Note r0-r2 are all taken by
; arguments here, so the base goes through r12 (hence the .w-encoded
; ldr.w/str.w in the expected output).
define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: viwdupt.u8 q0, r12, r1, #8
; CHECK-NEXT: str.w r12, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = zext i16 %p to i32
%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
%3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 %b, i32 8, <16 x i1> %2)
%4 = extractvalue { <16 x i8>, i32 } %3, 1
store i32 %4, i32* %a, align 4
%5 = extractvalue { <16 x i8>, i32 } %3, 0
ret <16 x i8> %5
}
define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: viwdupt.u16 q0, r12, r1, #8
; CHECK-NEXT: str.w r12, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = zext i16 %p to i32
%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
%3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 %b, i32 8, <8 x i1> %2)
%4 = extractvalue { <8 x i16>, i32 } %3, 1
store i32 %4, i32* %a, align 4
%5 = extractvalue { <8 x i16>, i32 } %3, 0
ret <8 x i16> %5
}
define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_viwdupq_m_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: viwdupt.u32 q0, r12, r1, #4
; CHECK-NEXT: str.w r12, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = zext i16 %p to i32
%2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
%3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 %b, i32 4, <4 x i1> %2)
%4 = extractvalue { <4 x i32>, i32 } %3, 1
store i32 %4, i32* %a, align 4
%5 = extractvalue { <4 x i32>, i32 } %3, 0
ret <4 x i32> %5
}
define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vdwdupt.u8 q0, r12, r1, #1
; CHECK-NEXT: str.w r12, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = zext i16 %p to i32
%2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
%3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 %b, i32 1, <16 x i1> %2)
%4 = extractvalue { <16 x i8>, i32 } %3, 1
store i32 %4, i32* %a, align 4
%5 = extractvalue { <16 x i8>, i32 } %3, 0
ret <16 x i8> %5
}
define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vdwdupt.u16 q0, r12, r1, #4
; CHECK-NEXT: str.w r12, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = zext i16 %p to i32
%2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
%3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 %b, i32 4, <8 x i1> %2)
%4 = extractvalue { <8 x i16>, i32 } %3, 1
store i32 %4, i32* %a, align 4
%5 = extractvalue { <8 x i16>, i32 } %3, 0
ret <8 x i16> %5
}
define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
; CHECK-LABEL: test_vdwdupq_m_wb_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldr.w r12, [r0]
; CHECK-NEXT: vmsr p0, r2
; CHECK-NEXT: vpst
; CHECK-NEXT: vdwdupt.u32 q0, r12, r1, #4
; CHECK-NEXT: str.w r12, [r0]
; CHECK-NEXT: bx lr
entry:
%0 = load i32, i32* %a, align 4
%1 = zext i16 %p to i32
%2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
%3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 %b, i32 4, <4 x i1> %2)
%4 = extractvalue { <4 x i32>, i32 } %3, 1
store i32 %4, i32* %a, align 4
%5 = extractvalue { <4 x i32>, i32 } %3, 0
ret <4 x i32> %5
}
; Intrinsic declarations. The dup intrinsics all return { vector,
; updated base }; predicated forms additionally take the inactive-lane
; vector first and the lane predicate last, and the wrapping (wdup)
; forms take the wrap limit after the base.
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32, i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32, i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32, i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32, i32, i32)
declare { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32, i32, i32)
declare { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32, i32, i32)
declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, i32, <4 x i1>)
declare { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, i32, <16 x i1>)
declare { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, i32, <8 x i1>)
declare { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, i32, <4 x i1>)