
[ARM,MVE] Rename and clean up VCTP IR intrinsics.

Summary:
D65884 added a set of Arm IR intrinsics for the MVE VCTP instruction,
to use in tail predication. But the 64-bit one doesn't work properly:
its predicate type is `<2 x i1>` / `v2i1`, which isn't a legal MVE
type (due to not having a full set of instructions that manipulate it
usefully). The test of `vctp64` in `basic-tail-pred.ll` goes through
`opt` fine, as the test expects, but if you then feed it to `llc` it
causes a type legality failure at isel time.
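
For concreteness, the removed copy_v2i64 test below boils down to IR of
this shape (a trimmed sketch; value names are illustrative):

  %pred = call <2 x i1> @llvm.arm.vctp64(i32 %elems)
  %ld = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %addr, i32 4, <2 x i1> %pred, <2 x i64> undef)
  tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %ld, <2 x i64>* %dst, i32 4, <2 x i1> %pred)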

The usual workaround we've been using in the rest of the MVE
intrinsics family is to bodge `v2i1` into `v4i1`. So I've adjusted the
`vctp64` IR intrinsic to do that, and completely removed the code (and
test) that uses that intrinsic for 64-bit tail predication. That will
allow me to add isel rules (upcoming in D70485) that actually generate
the VCTP64 instruction.
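
After this change the 64-bit intrinsic produces a v4i1 like its 32-bit
sibling, so IR using it looks like this (sketch, illustrative names):

  %pred = call <4 x i1> @llvm.arm.mve.vctp64(i32 %elems)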

Also renamed all four of these IR intrinsics so that they have `mve`
in the name, since its absence was confusing.
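
For reference, the renamed declarations as they appear in the updated
tests (the vctp64 form is inferred from the IntrinsicsARM.td change):

  declare <16 x i1> @llvm.arm.mve.vctp8(i32)
  declare <8 x i1> @llvm.arm.mve.vctp16(i32)
  declare <4 x i1> @llvm.arm.mve.vctp32(i32)
  declare <4 x i1> @llvm.arm.mve.vctp64(i32)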

Reviewers: ostannard, MarkMurrayARM, dmgreen

Reviewed By: MarkMurrayARM

Subscribers: samparker, kristof.beyls, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D70592
Simon Tatham 2019-12-02 16:18:24 +00:00
parent e5350535e5
commit 80f3d74e98
10 changed files with 40 additions and 82 deletions


@@ -773,10 +773,11 @@ class Neon_Dot_Intrinsic
 def int_arm_neon_udot : Neon_Dot_Intrinsic;
 def int_arm_neon_sdot : Neon_Dot_Intrinsic;
-def int_arm_vctp8 : Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_arm_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_arm_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>;
-def int_arm_vctp64 : Intrinsic<[llvm_v2i1_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_arm_mve_vctp8 : Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_arm_mve_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>;
+def int_arm_mve_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>;
+// vctp64 takes v4i1, to work around v2i1 not being a legal MVE type
+def int_arm_mve_vctp64 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>;
 // v8.3-A Floating-point complex add
 def int_arm_neon_vcadd_rot90 : Neon_2Arg_Intrinsic;


@@ -4291,11 +4291,11 @@ def MVE_VCTP32 : MVE_VCTP<"32", 0b10>;
 def MVE_VCTP64 : MVE_VCTP<"64", 0b11>;
 let Predicates = [HasMVEInt] in {
-def : Pat<(int_arm_vctp8 rGPR:$Rn),
+def : Pat<(int_arm_mve_vctp8 rGPR:$Rn),
 (v16i1 (MVE_VCTP8 rGPR:$Rn))>;
-def : Pat<(int_arm_vctp16 rGPR:$Rn),
+def : Pat<(int_arm_mve_vctp16 rGPR:$Rn),
 (v8i1 (MVE_VCTP16 rGPR:$Rn))>;
-def : Pat<(int_arm_vctp32 rGPR:$Rn),
+def : Pat<(int_arm_mve_vctp32 rGPR:$Rn),
 (v4i1 (MVE_VCTP32 rGPR:$Rn))>;
 }


@@ -485,10 +485,15 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
 switch (VecTy->getNumElements()) {
 default:
 llvm_unreachable("unexpected number of lanes");
-case 2: VCTPID = Intrinsic::arm_vctp64; break;
-case 4: VCTPID = Intrinsic::arm_vctp32; break;
-case 8: VCTPID = Intrinsic::arm_vctp16; break;
-case 16: VCTPID = Intrinsic::arm_vctp8; break;
+case 4: VCTPID = Intrinsic::arm_mve_vctp32; break;
+case 8: VCTPID = Intrinsic::arm_mve_vctp16; break;
+case 16: VCTPID = Intrinsic::arm_mve_vctp8; break;
+// FIXME: vctp64 currently not supported because the predicate
+// vector wants to be <2 x i1>, but v2i1 is not a legal MVE
+// type, so problems happen at isel time.
+// Intrinsic::arm_mve_vctp64 exists for ACLE intrinsics
+// purposes, but takes a v4i1 instead of a v2i1.
 }
 Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
 Value *TailPredicate = Builder.CreateCall(VCTP, Processed);


@@ -4,7 +4,7 @@
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.vctp8(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <16 x i1> @llvm.arm.mve.vctp8(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 16
 ; CHECK: [[LD0:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* {{.*}}, i32 4, <16 x i1> [[VCTP]], <16 x i8> undef)
@@ -57,7 +57,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 8
 ; CHECK: [[LD0:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
@@ -109,7 +109,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; CHECK-LABEL: mul_v4i32
 ; CHECK: vector.body:
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
@@ -158,59 +158,11 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ret void
 }
-; CHECK-LABEL: copy_v2i64
-; CHECK: vector.body:
-; CHECK: %index = phi i32
-; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <2 x i1> @llvm.arm.vctp64(i32 [[ELEMS]])
-; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 2
-; CHECK: [[LD0:%[^ ]+]] = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]], <2 x i64> undef)
-; CHECK: tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> [[LD0]], <2 x i64>* {{.*}}, i32 4, <2 x i1> [[VCTP]])
-define void @copy_v2i64(i64* %a, i64* %b, i32 %N) {
-entry:
-%cmp8 = icmp eq i32 %N, 0
-%tmp8 = add i32 %N, 1
-%tmp9 = lshr i32 %tmp8, 1
-%tmp10 = shl nuw i32 %tmp9, 1
-%tmp11 = add i32 %tmp10, -2
-%tmp12 = lshr i32 %tmp11, 1
-%tmp13 = add nuw nsw i32 %tmp12, 1
-br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
-vector.ph: ; preds = %entry
-%trip.count.minus.1 = add i32 %N, -1
-%broadcast.splatinsert10 = insertelement <2 x i32> undef, i32 %trip.count.minus.1, i32 0
-%broadcast.splat11 = shufflevector <2 x i32> %broadcast.splatinsert10, <2 x i32> undef, <2 x i32> zeroinitializer
-call void @llvm.set.loop.iterations.i32(i32 %tmp13)
-br label %vector.body
-vector.body: ; preds = %vector.body, %vector.ph
-%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-%tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ]
-%broadcast.splatinsert = insertelement <2 x i32> undef, i32 %index, i32 0
-%broadcast.splat = shufflevector <2 x i32> %broadcast.splatinsert, <2 x i32> undef, <2 x i32> zeroinitializer
-%induction = add <2 x i32> %broadcast.splat, <i32 0, i32 1>
-%tmp1 = icmp ule <2 x i32> %induction, %broadcast.splat11
-%tmp = getelementptr inbounds i64, i64* %a, i32 %index
-%tmp2 = bitcast i64* %tmp to <2 x i64>*
-%wide.masked.load = tail call <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>* %tmp2, i32 4, <2 x i1> %tmp1, <2 x i64> undef)
-%tmp3 = getelementptr inbounds i64, i64* %b, i32 %index
-%tmp7 = bitcast i64* %tmp3 to <2 x i64>*
-tail call void @llvm.masked.store.v2i64.p0v2i64(<2 x i64> %wide.masked.load, <2 x i64>* %tmp7, i32 4, <2 x i1> %tmp1)
-%index.next = add i32 %index, 2
-%tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
-%tmp16 = icmp ne i32 %tmp15, 0
-br i1 %tmp16, label %vector.body, label %for.cond.cleanup
-for.cond.cleanup: ; preds = %vector.body, %entry
-ret void
-}
 ; CHECK-LABEL: split_vector
 ; CHECK: vector.body:
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
@@ -268,7 +220,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; One of the loads now uses ult predicate.
 ; CHECK-LABEL: mismatch_load_pred
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> %wrong, <4 x i32> undef)
@@ -322,7 +274,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; CHECK-LABEL: mismatch_store_pred
 ; CHECK: %index = phi i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[REMAINING:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELEMS]])
 ; CHECK: [[REMAINING]] = sub i32 [[ELEMS]], 4
 ; CHECK: [[LD0:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)
 ; CHECK: [[LD1:%[^ ]+]] = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]], <4 x i32> undef)


@@ -28,7 +28,7 @@ define void @mat_vec_sext_i16(i16** nocapture readonly %A, i16* nocapture readon
 ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, i16* [[TMP3]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
 ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i16* [[TMP6]] to <4 x i16>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP8]], i32 2, <4 x i1> [[TMP1]], <4 x i16> undef)
@@ -140,7 +140,7 @@ define void @mat_vec_i32(i32** nocapture readonly %A, i32* nocapture readonly %B
 ; CHECK-NEXT: [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = phi i32 [ [[N]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[TMP3]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.vctp32(i32 [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP0]])
 ; CHECK-NEXT: [[TMP2]] = sub i32 [[TMP0]], 4
 ; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP8]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)


@@ -1,7 +1,7 @@
 ; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -disable-mve-tail-predication=false -mattr=+mve,+lob %s -S -o - | FileCheck %s
 ; CHECK-LABEL: expand_v8i16_v8i32
-; CHECK-NOT: call i32 @llvm.arm.vctp
+; CHECK-NOT: call i32 @llvm.arm.mve.vctp
 define void @expand_v8i16_v8i32(i16* noalias nocapture readonly %a, i16* noalias nocapture readonly %b, i32* noalias nocapture %c, i32 %N) {
 entry:
 %cmp8 = icmp eq i32 %N, 0
@@ -50,7 +50,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 ; CHECK-LABEL: expand_v8i16_v4i32
 ; CHECK: [[ELEMS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[ELEMS_REM:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[ELEMS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[ELEMS]])
 ; CHECK: [[ELEMS_REM]] = sub i32 [[ELEMS]], 8
 ; CHECK: tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* {{.*}}, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: %store.pred = icmp ule <4 x i32> %induction.store
@@ -117,7 +117,7 @@ for.cond.cleanup: ; preds = %vector.body, %entry
 }
 ; CHECK-LABEL: expand_v4i32_v4i64
-; CHECK-NOT: call i32 @llvm.arm.vctp
+; CHECK-NOT: call i32 @llvm.arm.mve.vctp
 define void @expand_v4i32_v4i64(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i64* noalias nocapture %c, i32 %N) {
 entry:
 %cmp8 = icmp eq i32 %N, 0


@@ -5,7 +5,7 @@
 ; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
 ; CHECK: phi i32
 ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]])
 ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp6, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
@@ -63,7 +63,7 @@ middle.block: ; preds = %vector.body
 ; CHECK: phi <8 x i16> [ zeroinitializer, %entry ]
 ; CHECK: phi i32
 ; CHECK: [[PHI:%[^ ]+]] = phi i32 [ %N, %entry ], [ [[ELEMS:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.vctp16(i32 [[PHI]])
+; CHECK: [[VCTP:%[^ ]+]] = call <8 x i1> @llvm.arm.mve.vctp16(i32 [[PHI]])
 ; CHECK: [[ELEMS]] = sub i32 [[PHI]], 8
 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp4, i32 4, <8 x i1> [[VCTP]], <8 x i16> undef)
 define i16 @reduction_i32_with_scalar(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr {


@@ -6,13 +6,13 @@
 ; CHECK: vector.body:
 ; CHECK-NOT: phi i32 [ 0, %vector.ph ]
 ; CHECK: [[ELTS:%[^ ]+]] = phi i32 [ %N, %vector.ph ], [ [[SUB:%[^ ]+]], %vector.body ]
-; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
+; CHECK: [[VCTP:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]])
 ; CHECK: [[SUB]] = sub i32 [[ELTS]], 4
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]]
 ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
 ; CHECK: middle.block:
-; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.vctp32(i32 [[ELTS]])
+; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[ELTS]])
 ; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
 ; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])


@@ -10,7 +10,7 @@ define void @vctp8(i32 %arg, <16 x i8> *%in, <16 x i8>* %out) {
 ; CHECK-NEXT: vpsel q0, q1, q0
 ; CHECK-NEXT: vstrw.32 q0, [r2]
 ; CHECK-NEXT: bx lr
-%pred = call <16 x i1> @llvm.arm.vctp8(i32 %arg)
+%pred = call <16 x i1> @llvm.arm.mve.vctp8(i32 %arg)
 %ld = load <16 x i8>, <16 x i8>* %in
 %res = select <16 x i1> %pred, <16 x i8> %ld, <16 x i8> zeroinitializer
 store <16 x i8> %res, <16 x i8>* %out
@@ -26,7 +26,7 @@ define void @vctp16(i32 %arg, <8 x i16> *%in, <8 x i16>* %out) {
 ; CHECK-NEXT: vpsel q0, q1, q0
 ; CHECK-NEXT: vstrw.32 q0, [r2]
 ; CHECK-NEXT: bx lr
-%pred = call <8 x i1> @llvm.arm.vctp16(i32 %arg)
+%pred = call <8 x i1> @llvm.arm.mve.vctp16(i32 %arg)
 %ld = load <8 x i16>, <8 x i16>* %in
 %res = select <8 x i1> %pred, <8 x i16> %ld, <8 x i16> zeroinitializer
 store <8 x i16> %res, <8 x i16>* %out
@@ -42,13 +42,13 @@ define void @vctp32(i32 %arg, <4 x i32> *%in, <4 x i32>* %out) {
 ; CHECK-NEXT: vpsel q0, q1, q0
 ; CHECK-NEXT: vstrw.32 q0, [r2]
 ; CHECK-NEXT: bx lr
-%pred = call <4 x i1> @llvm.arm.vctp32(i32 %arg)
+%pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %arg)
 %ld = load <4 x i32>, <4 x i32>* %in
 %res = select <4 x i1> %pred, <4 x i32> %ld, <4 x i32> zeroinitializer
 store <4 x i32> %res, <4 x i32>* %out
 ret void
 }
-declare <16 x i1> @llvm.arm.vctp8(i32)
-declare <8 x i1> @llvm.arm.vctp16(i32)
-declare <4 x i1> @llvm.arm.vctp32(i32)
+declare <16 x i1> @llvm.arm.mve.vctp8(i32)
+declare <8 x i1> @llvm.arm.mve.vctp16(i32)
+declare <4 x i1> @llvm.arm.mve.vctp32(i32)


@@ -27,7 +27,7 @@ define arm_aapcs_vfpcc <8 x i16> @test_vpnot(<8 x i16> %v, <8 x i16> %w, <8 x i1
 ; CHECK-NEXT: vaddt.i16 q0, q1, q2
 ; CHECK-NEXT: bx lr
 entry:
-%0 = call <8 x i1> @llvm.arm.vctp16(i32 %n)
+%0 = call <8 x i1> @llvm.arm.mve.vctp16(i32 %n)
 %1 = call i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1> %0)
 %2 = trunc i32 %1 to i16
 %3 = xor i16 %2, -1
@@ -40,5 +40,5 @@ entry:
 declare i32 @llvm.arm.mve.pred.v2i.v8i1(<8 x i1>)
 declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
 declare <8 x i16> @llvm.arm.mve.add.predicated.v8i16.v8i1(<8 x i16>, <8 x i16>, <8 x i1>, <8 x i16>)
-declare <8 x i1> @llvm.arm.vctp16(i32)
+declare <8 x i1> @llvm.arm.mve.vctp16(i32)