
[ARM][MVE] Add intrinsics for immediate shifts. (reland)

This adds the family of `vshlq_n` and `vshrq_n` ACLE intrinsics, which
shift every lane of a vector left or right by a compile-time
immediate. They mostly work by expanding to the IR `shl`, `lshr` and
`ashr` operations, with their second operand being a vector splat of
the immediate.
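
For illustration, a minimal usage sketch (the function name is
illustrative; built with something like `-march=armv8.1-m.main+mve`):

    #include <arm_mve.h>

    // Shift every 32-bit lane of a left by the compile-time immediate 18.
    // Expected to lower to the IR:
    //   %0 = shl <4 x i32> %a, <i32 18, i32 18, i32 18, i32 18>
    int32x4_t shift_each_lane(int32x4_t a) {
      return vshlq_n_s32(a, 18);
    }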

There's a fiddly special case, though. ACLE specifies that the
immediate in `vshrq_n` can take values up to and including the bit
size of the vector lane. But in LLVM IR a right shift by the full lane
width is undefined, and the optimizer feels free to replace the `lshr`
with an `undef` halfway through the pipeline. Hence, to keep this
legal in source code, I have to detect it at codegen time.
Logical (unsigned) right shifts by the element size are handled by
simply emitting the zero vector; arithmetic ones are converted into a
shift of one bit less, which will always give the same output.
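
A sketch of what that codegen-time check amounts to (the helper name
and exact signature are illustrative, not necessarily those in this
patch):

    // Emit a right shift of vector V by a compile-time immediate, where
    // ACLE permits Shift == lane size but IR's lshr/ashr would be undefined.
    static llvm::Value *emitMVEImmShiftRight(llvm::IRBuilderBase &Builder,
                                             llvm::Value *V, unsigned Shift,
                                             bool Unsigned) {
      unsigned LaneBits = V->getType()->getScalarSizeInBits();
      if (Shift == LaneBits) {
        // A logical shift by the full lane width always yields zero.
        if (Unsigned)
          return llvm::Constant::getNullValue(V->getType());
        // An arithmetic shift by the full lane width gives the same result
        // as a shift by one bit less: every lane becomes 0 or -1.
        --Shift;
      }
      // ConstantInt::get on a vector type produces the splat directly.
      llvm::Value *Amount = llvm::ConstantInt::get(V->getType(), Shift);
      return Unsigned ? Builder.CreateLShr(V, Amount)
                      : Builder.CreateAShr(V, Amount);
    }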

In order to do that check, I also had to enhance the tablegen
MveEmitter so that it can cope with converting a builtin function's
operand into a bare integer to pass to a code-generating subfunction.
Previously the only bare integers it knew how to handle were flags
generated from within `arm_mve.td`.
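
So a generated call site can now look roughly like this (a hedged
sketch reusing the helper above; `Ops`, `E` and `getContext()` stand
in for the usual CGBuiltin surroundings):

    // The builtin's immediate operand is constant-evaluated to a bare
    // integer rather than emitted as an IR value; the trailing flag is
    // the kind of bare integer previously generated from arm_mve.td.
    llvm::APSInt ShiftImm;
    bool IsConst = E->getArg(1)->isIntegerConstantExpr(ShiftImm, getContext());
    assert(IsConst && "shift amount must be a compile-time constant");
    return emitMVEImmShiftRight(Builder, Ops[0], ShiftImm.getZExtValue(),
                                /*Unsigned=*/true);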

Reviewers: dmgreen, miyuki, MarkMurrayARM, ostannard

Reviewed By: dmgreen, MarkMurrayARM

Subscribers: echristo, hokein, rdhindsa, kristof.beyls, hiraditya, cfe-commits, llvm-commits

Tags: #clang, #llvm

Differential Revision: https://reviews.llvm.org/D71065
Simon Tatham 2019-12-11 10:04:14 +00:00
parent 874a0827bb
commit 4dbfeb87c3
3 changed files with 438 additions and 20 deletions


@@ -913,6 +913,14 @@ defm int_arm_mve_vstr_scatter_offset: MVEPredicated<
    [], [llvm_anyptr_ty, llvm_anyvector_ty, llvm_anyvector_ty,
     llvm_i32_ty, llvm_i32_ty], llvm_anyvector_ty, [IntrWriteMem]>;
 
+def int_arm_mve_shl_imm_predicated: Intrinsic<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>],
+   [IntrNoMem]>;
+def int_arm_mve_shr_imm_predicated: Intrinsic<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty, // extra i32 is unsigned flag
+    llvm_anyvector_ty, LLVMMatchType<0>],
+   [IntrNoMem]>;
+
 // MVE scalar shifts.
 class ARM_MVE_qrshift_single<list<LLVMType> value,
                              list<LLVMType> saturate = []> :
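
For reference, a front end would build a call to one of these
overloaded intrinsics along these lines (a minimal IRBuilder sketch;
the variable names are illustrative):

    // The intrinsic is overloaded on the data vector type and the predicate
    // vector type, so both must be supplied when getting the declaration.
    llvm::Function *ShlImm = llvm::Intrinsic::getDeclaration(
        &M, llvm::Intrinsic::arm_mve_shl_imm_predicated, {VecTy, PredTy});
    llvm::Value *Result = Builder.CreateCall(
        ShlImm, {Src, Builder.getInt32(ShiftAmount), Mask, Inactive});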


@@ -2816,27 +2816,39 @@ def MVE_VSHL_immi32 : MVE_VSHL_imm<"i32", (ins imm0_31:$imm)> {
   let Inst{21} = 0b1;
 }
 
+multiclass MVE_immediate_shift_patterns_inner<
+    MVEVectorVTInfo VTI, Operand imm_operand_type, SDNode unpred_op,
+    Intrinsic pred_int, Instruction inst, list<int> unsignedFlag = []> {
+  def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$src), imm_operand_type:$imm)),
+            (VTI.Vec (inst (VTI.Vec MQPR:$src), imm_operand_type:$imm))>;
+
+  // The right-shift intrinsic takes an extra i32 operand (the unsignedness
+  // flag) that the left-shift one lacks. !dag(pred_int, unsignedFlag, ?)
+  // turns the optional one-element unsignedFlag list into that extra
+  // operand (or into nothing, if the list is empty), and !con splices the
+  // fragments into a single match dag.
+  def : Pat<(VTI.Vec !con((pred_int (VTI.Vec MQPR:$src), imm_operand_type:$imm),
+                          !dag(pred_int, unsignedFlag, ?),
+                          (pred_int (VTI.Pred VCCR:$mask),
+                                    (VTI.Vec MQPR:$inactive)))),
+            (VTI.Vec (inst (VTI.Vec MQPR:$src), imm_operand_type:$imm,
+                           ARMVCCThen, (VTI.Pred VCCR:$mask),
+                           (VTI.Vec MQPR:$inactive)))>;
+}
+
+multiclass MVE_immediate_shift_patterns<MVEVectorVTInfo VTI,
+                                        Operand imm_operand_type> {
+  defm : MVE_immediate_shift_patterns_inner<VTI, imm_operand_type,
+             ARMvshlImm, int_arm_mve_shl_imm_predicated,
+             !cast<Instruction>("MVE_VSHL_immi" # VTI.BitsSuffix)>;
+  defm : MVE_immediate_shift_patterns_inner<VTI, imm_operand_type,
+             ARMvshruImm, int_arm_mve_shr_imm_predicated,
+             !cast<Instruction>("MVE_VSHR_immu" # VTI.BitsSuffix), [1]>;
+  defm : MVE_immediate_shift_patterns_inner<VTI, imm_operand_type,
+             ARMvshrsImm, int_arm_mve_shr_imm_predicated,
+             !cast<Instruction>("MVE_VSHR_imms" # VTI.BitsSuffix), [0]>;
+}
+
 let Predicates = [HasMVEInt] in {
-  def : Pat<(v4i32 (ARMvshlImm (v4i32 MQPR:$src), imm0_31:$imm)),
-            (v4i32 (MVE_VSHL_immi32 (v4i32 MQPR:$src), imm0_31:$imm))>;
-  def : Pat<(v8i16 (ARMvshlImm (v8i16 MQPR:$src), imm0_15:$imm)),
-            (v8i16 (MVE_VSHL_immi16 (v8i16 MQPR:$src), imm0_15:$imm))>;
-  def : Pat<(v16i8 (ARMvshlImm (v16i8 MQPR:$src), imm0_7:$imm)),
-            (v16i8 (MVE_VSHL_immi8 (v16i8 MQPR:$src), imm0_7:$imm))>;
-  def : Pat<(v4i32 (ARMvshruImm (v4i32 MQPR:$src), imm0_31:$imm)),
-            (v4i32 (MVE_VSHR_immu32 (v4i32 MQPR:$src), imm0_31:$imm))>;
-  def : Pat<(v8i16 (ARMvshruImm (v8i16 MQPR:$src), imm0_15:$imm)),
-            (v8i16 (MVE_VSHR_immu16 (v8i16 MQPR:$src), imm0_15:$imm))>;
-  def : Pat<(v16i8 (ARMvshruImm (v16i8 MQPR:$src), imm0_7:$imm)),
-            (v16i8 (MVE_VSHR_immu8 (v16i8 MQPR:$src), imm0_7:$imm))>;
-  def : Pat<(v4i32 (ARMvshrsImm (v4i32 MQPR:$src), imm0_31:$imm)),
-            (v4i32 (MVE_VSHR_imms32 (v4i32 MQPR:$src), imm0_31:$imm))>;
-  def : Pat<(v8i16 (ARMvshrsImm (v8i16 MQPR:$src), imm0_15:$imm)),
-            (v8i16 (MVE_VSHR_imms16 (v8i16 MQPR:$src), imm0_15:$imm))>;
-  def : Pat<(v16i8 (ARMvshrsImm (v16i8 MQPR:$src), imm0_7:$imm)),
-            (v16i8 (MVE_VSHR_imms8 (v16i8 MQPR:$src), imm0_7:$imm))>;
+  defm : MVE_immediate_shift_patterns<MVE_v16i8, imm0_7>;
+  defm : MVE_immediate_shift_patterns<MVE_v8i16, imm0_15>;
+  defm : MVE_immediate_shift_patterns<MVE_v4i32, imm0_31>;
 }
 
 // end of mve_shift instructions


@@ -0,0 +1,398 @@ (new test file)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s
define arm_aapcs_vfpcc <16 x i8> @test_vshlq_n_s8(<16 x i8> %a) {
; CHECK-LABEL: test_vshlq_n_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshl.i8 q0, q0, #5
; CHECK-NEXT: bx lr
entry:
%0 = shl <16 x i8> %a, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
ret <16 x i8> %0
}
define arm_aapcs_vfpcc <8 x i16> @test_vshlq_n_s16(<8 x i16> %a) {
; CHECK-LABEL: test_vshlq_n_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshl.i16 q0, q0, #5
; CHECK-NEXT: bx lr
entry:
%0 = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
ret <8 x i16> %0
}
define arm_aapcs_vfpcc <4 x i32> @test_vshlq_n_s32(<4 x i32> %a) {
; CHECK-LABEL: test_vshlq_n_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshl.i32 q0, q0, #18
; CHECK-NEXT: bx lr
entry:
%0 = shl <4 x i32> %a, <i32 18, i32 18, i32 18, i32 18>
ret <4 x i32> %0
}
define arm_aapcs_vfpcc <16 x i8> @test_vshrq_n_s8(<16 x i8> %a) {
; CHECK-LABEL: test_vshrq_n_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.s8 q0, q0, #4
; CHECK-NEXT: bx lr
entry:
%0 = ashr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
ret <16 x i8> %0
}
define arm_aapcs_vfpcc <8 x i16> @test_vshrq_n_s16(<8 x i16> %a) {
; CHECK-LABEL: test_vshrq_n_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.s16 q0, q0, #10
; CHECK-NEXT: bx lr
entry:
%0 = ashr <8 x i16> %a, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
ret <8 x i16> %0
}
define arm_aapcs_vfpcc <4 x i32> @test_vshrq_n_s32(<4 x i32> %a) {
; CHECK-LABEL: test_vshrq_n_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.s32 q0, q0, #19
; CHECK-NEXT: bx lr
entry:
%0 = ashr <4 x i32> %a, <i32 19, i32 19, i32 19, i32 19>
ret <4 x i32> %0
}
define arm_aapcs_vfpcc <16 x i8> @test_vshrq_n_u8(<16 x i8> %a) {
; CHECK-LABEL: test_vshrq_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.u8 q0, q0, #1
; CHECK-NEXT: bx lr
entry:
%0 = lshr <16 x i8> %a, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
ret <16 x i8> %0
}
define arm_aapcs_vfpcc <8 x i16> @test_vshrq_n_u16(<8 x i16> %a) {
; CHECK-LABEL: test_vshrq_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.u16 q0, q0, #10
; CHECK-NEXT: bx lr
entry:
%0 = lshr <8 x i16> %a, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10>
ret <8 x i16> %0
}
define arm_aapcs_vfpcc <4 x i32> @test_vshrq_n_u32(<4 x i32> %a) {
; CHECK-LABEL: test_vshrq_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vshr.u32 q0, q0, #10
; CHECK-NEXT: bx lr
entry:
%0 = lshr <4 x i32> %a, <i32 10, i32 10, i32 10, i32 10>
ret <4 x i32> %0
}
define arm_aapcs_vfpcc <16 x i8> @test_vshlq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshlq_m_n_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlt.i8 q0, q1, #6
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = tail call <16 x i8> @llvm.arm.mve.shl.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 6, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
define arm_aapcs_vfpcc <8 x i16> @test_vshlq_m_n_s16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshlq_m_n_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlt.i16 q0, q1, #13
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call <8 x i16> @llvm.arm.mve.shl.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 13, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
define arm_aapcs_vfpcc <4 x i32> @test_vshlq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshlq_m_n_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlt.i32 q0, q1, #0
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call <4 x i32> @llvm.arm.mve.shl.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 0, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
define arm_aapcs_vfpcc <16 x i8> @test_vshrq_m_n_s8(<16 x i8> %inactive, <16 x i8> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshrq_m_n_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshrt.s8 q0, q1, #2
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = tail call <16 x i8> @llvm.arm.mve.shr.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 2, i32 0, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
define arm_aapcs_vfpcc <8 x i16> @test_vshrq_m_n_s16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshrq_m_n_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshrt.s16 q0, q1, #3
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call <8 x i16> @llvm.arm.mve.shr.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 3, i32 0, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
define arm_aapcs_vfpcc <4 x i32> @test_vshrq_m_n_s32(<4 x i32> %inactive, <4 x i32> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshrq_m_n_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshrt.s32 q0, q1, #13
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call <4 x i32> @llvm.arm.mve.shr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 13, i32 0, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
define arm_aapcs_vfpcc <16 x i8> @test_vshrq_m_n_u8(<16 x i8> %inactive, <16 x i8> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshrq_m_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshrt.u8 q0, q1, #4
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = tail call <16 x i8> @llvm.arm.mve.shr.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 4, i32 1, <16 x i1> %1, <16 x i8> %inactive)
ret <16 x i8> %2
}
define arm_aapcs_vfpcc <8 x i16> @test_vshrq_m_n_u16(<8 x i16> %inactive, <8 x i16> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshrq_m_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshrt.u16 q0, q1, #14
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call <8 x i16> @llvm.arm.mve.shr.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 14, i32 1, <8 x i1> %1, <8 x i16> %inactive)
ret <8 x i16> %2
}
define arm_aapcs_vfpcc <4 x i32> @test_vshrq_m_n_u32(<4 x i32> %inactive, <4 x i32> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshrq_m_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshrt.u32 q0, q1, #21
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call <4 x i32> @llvm.arm.mve.shr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 21, i32 1, <4 x i1> %1, <4 x i32> %inactive)
ret <4 x i32> %2
}
define arm_aapcs_vfpcc <16 x i8> @test_vshlq_x_n_s8(<16 x i8> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshlq_x_n_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlt.i8 q0, q0, #1
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = tail call <16 x i8> @llvm.arm.mve.shl.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 1, <16 x i1> %1, <16 x i8> undef)
ret <16 x i8> %2
}
define arm_aapcs_vfpcc <8 x i16> @test_vshlq_x_n_s16(<8 x i16> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshlq_x_n_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlt.i16 q0, q0, #15
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call <8 x i16> @llvm.arm.mve.shl.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 15, <8 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
define arm_aapcs_vfpcc <4 x i32> @test_vshlq_x_n_s32(<4 x i32> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshlq_x_n_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlt.i32 q0, q0, #13
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call <4 x i32> @llvm.arm.mve.shl.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 13, <4 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
define arm_aapcs_vfpcc <16 x i8> @test_vshlq_x_n_u8(<16 x i8> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshlq_x_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlt.i8 q0, q0, #4
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = tail call <16 x i8> @llvm.arm.mve.shl.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 4, <16 x i1> %1, <16 x i8> undef)
ret <16 x i8> %2
}
define arm_aapcs_vfpcc <8 x i16> @test_vshlq_x_n_u16(<8 x i16> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshlq_x_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlt.i16 q0, q0, #10
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call <8 x i16> @llvm.arm.mve.shl.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 10, <8 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
define arm_aapcs_vfpcc <4 x i32> @test_vshlq_x_n_u32(<4 x i32> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshlq_x_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshlt.i32 q0, q0, #30
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call <4 x i32> @llvm.arm.mve.shl.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 30, <4 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
define arm_aapcs_vfpcc <16 x i8> @test_vshrq_x_n_s8(<16 x i8> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshrq_x_n_s8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshrt.s8 q0, q0, #4
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = tail call <16 x i8> @llvm.arm.mve.shr.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 4, i32 0, <16 x i1> %1, <16 x i8> undef)
ret <16 x i8> %2
}
define arm_aapcs_vfpcc <8 x i16> @test_vshrq_x_n_s16(<8 x i16> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshrq_x_n_s16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshrt.s16 q0, q0, #10
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call <8 x i16> @llvm.arm.mve.shr.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 10, i32 0, <8 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
define arm_aapcs_vfpcc <4 x i32> @test_vshrq_x_n_s32(<4 x i32> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshrq_x_n_s32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshrt.s32 q0, q0, #7
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call <4 x i32> @llvm.arm.mve.shr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 7, i32 0, <4 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
define arm_aapcs_vfpcc <16 x i8> @test_vshrq_x_n_u8(<16 x i8> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshrq_x_n_u8:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshrt.u8 q0, q0, #7
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
%2 = tail call <16 x i8> @llvm.arm.mve.shr.imm.predicated.v16i8.v16i1(<16 x i8> %a, i32 7, i32 1, <16 x i1> %1, <16 x i8> undef)
ret <16 x i8> %2
}
define arm_aapcs_vfpcc <8 x i16> @test_vshrq_x_n_u16(<8 x i16> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshrq_x_n_u16:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshrt.u16 q0, q0, #7
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
%2 = tail call <8 x i16> @llvm.arm.mve.shr.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 7, i32 1, <8 x i1> %1, <8 x i16> undef)
ret <8 x i16> %2
}
define arm_aapcs_vfpcc <4 x i32> @test_vshrq_x_n_u32(<4 x i32> %a, i16 zeroext %p) {
; CHECK-LABEL: test_vshrq_x_n_u32:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpst
; CHECK-NEXT: vshrt.u32 q0, q0, #6
; CHECK-NEXT: bx lr
entry:
%0 = zext i16 %p to i32
%1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
%2 = tail call <4 x i32> @llvm.arm.mve.shr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 6, i32 1, <4 x i1> %1, <4 x i32> undef)
ret <4 x i32> %2
}
declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
declare <16 x i8> @llvm.arm.mve.shl.imm.predicated.v16i8.v16i1(<16 x i8>, i32, <16 x i1>, <16 x i8>)
declare <8 x i16> @llvm.arm.mve.shl.imm.predicated.v8i16.v8i1(<8 x i16>, i32, <8 x i1>, <8 x i16>)
declare <4 x i32> @llvm.arm.mve.shl.imm.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>, <4 x i32>)
declare <16 x i8> @llvm.arm.mve.shr.imm.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>, <16 x i8>)
declare <8 x i16> @llvm.arm.mve.shr.imm.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>, <8 x i16>)
declare <4 x i32> @llvm.arm.mve.shr.imm.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>, <4 x i32>)