
[ARM] Rejig some of the MVE gather/scatter lowering pass. NFC

This adjusts how the gather/scatter lowering pass passes data around
and where certain gathers/scatters are created. It should not affect
code generation on its own, but it allows other patches to reason about
the code more clearly.

A number of extra test cases were also added for smaller
gathers/scatters that can be extended, and some of the test comments
were updated.
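
For context, the "incrementing" pattern that tryCreateIncrementingGatScat
looks for is a loop-carried vector of offsets advanced by a constant splat
on every iteration. A minimal illustrative IR sketch (not taken from this
commit; the names are made up) of a gather that can become a writeback QI
gather such as vldrw.u32 q0, [q1, #16]!:

  vector.body:
    %offs = phi <4 x i32> [ %offs.init, %entry ], [ %offs.next, %vector.body ]
    ; gather from base plus per-lane offsets
    %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
    %data = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
    ; constant per-iteration increment the pass can fold into the gather
    %offs.next = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>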
David Green 2021-06-15 15:38:39 +01:00
parent 30958bdc95
commit e55f90b072
6 changed files with 200 additions and 56 deletions

View File

@@ -84,8 +84,8 @@ private:
   // Check for a getelementptr and deduce base and offsets from it, on success
   // returning the base directly and the offsets indirectly using the Offsets
   // argument
-  Value *checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP,
-                  IRBuilder<> &Builder);
+  Value *decomposeGEP(Value *&Offsets, FixedVectorType *Ty,
+                      GetElementPtrInst *GEP, IRBuilder<> &Builder);
   // Compute the scale of this gather/scatter instruction
   int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
   // If the value is a constant, or derived from constants via additions
@@ -123,8 +123,7 @@ private:
   // QI gathers and scatters can increment their offsets on their own if
   // the increment is a constant value (digit)
-  Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *BasePtr,
-                                      Value *Ptr, GetElementPtrInst *GEP,
+  Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *Ptr,
                                       IRBuilder<> &Builder);
   // QI gathers/scatters can increment their offsets on their own if the
   // increment is a constant value (digit) - this creates a writeback QI
@@ -214,9 +213,10 @@ static bool checkOffsetSize(Value *Offsets, unsigned TargetElemCount) {
   return true;
 }
 
-Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, FixedVectorType *Ty,
-                                          GetElementPtrInst *GEP,
-                                          IRBuilder<> &Builder) {
+Value *MVEGatherScatterLowering::decomposeGEP(Value *&Offsets,
+                                              FixedVectorType *Ty,
+                                              GetElementPtrInst *GEP,
+                                              IRBuilder<> &Builder) {
   if (!GEP) {
     LLVM_DEBUG(dbgs() << "masked gathers/scatters: no getelementpointer "
                       << "found\n");
@@ -372,7 +372,10 @@ Value *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
   Builder.SetCurrentDebugLocation(I->getDebugLoc());
 
   Instruction *Root = I;
-  Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
+
+  Value *Load = tryCreateIncrementingGatScat(I, Ptr, Builder);
+  if (!Load)
+    Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
   if (!Load)
     Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
   if (!Load)
@@ -478,14 +481,9 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
   Value *BasePtr =
-      checkGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder);
+      decomposeGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder);
   if (!BasePtr)
     return nullptr;
-  // Check whether the offset is a constant increment that could be merged into
-  // a QI gather
-  Value *Load = tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
-  if (Load)
-    return Load;
 
   int Scale =
       computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
@@ -533,7 +531,9 @@ Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
   Builder.SetInsertPoint(I);
   Builder.SetCurrentDebugLocation(I->getDebugLoc());
 
-  Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
+  Value *Store = tryCreateIncrementingGatScat(I, Ptr, Builder);
+  if (!Store)
+    Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
   if (!Store)
     Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
   if (!Store)
@@ -598,6 +598,7 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
   Value *Mask = I->getArgOperand(3);
   Type *InputTy = Input->getType();
   Type *MemoryTy = InputTy;
+
   LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
                     << " to base + vector of offsets\n");
   // If the input has been truncated, try to integrate that trunc into the
@@ -619,15 +620,10 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
   Value *BasePtr =
-      checkGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder);
+      decomposeGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder);
   if (!BasePtr)
     return nullptr;
-  // Check whether the offset is a constant increment that could be merged into
-  // a QI gather
-  Value *Store =
-      tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
-  if (Store)
-    return Store;
+
   int Scale =
       computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
                    MemoryTy->getScalarSizeInBits());
@@ -652,21 +648,28 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
 }
 
 Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
-    IntrinsicInst *I, Value *BasePtr, Value *Offsets, GetElementPtrInst *GEP,
-    IRBuilder<> &Builder) {
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
   FixedVectorType *Ty;
   if (I->getIntrinsicID() == Intrinsic::masked_gather)
     Ty = cast<FixedVectorType>(I->getType());
   else
     Ty = cast<FixedVectorType>(I->getArgOperand(0)->getType());
+
   // Incrementing gathers only exist for v4i32
-  if (Ty->getNumElements() != 4 ||
-      Ty->getScalarSizeInBits() != 32)
+  if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
     return nullptr;
+  // Incrementing gathers are not beneficial outside of a loop
   Loop *L = LI->getLoopFor(I->getParent());
   if (L == nullptr)
-    // Incrementing gathers are not beneficial outside of a loop
     return nullptr;
+
+  // Decompose the GEP into Base and Offsets
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  Value *Offsets;
+  Value *BasePtr = decomposeGEP(Offsets, Ty, GEP, Builder);
+  if (!BasePtr)
+    return nullptr;
+
   LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
                        "wb gather/scatter\n");
@@ -689,6 +692,7 @@ Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
     if (Load != nullptr)
       return Load;
   }
+
   LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
                        "non-wb gather/scatter\n");

View File

@@ -273,6 +273,25 @@ entry:
   ret <4 x i32> %ext
 }
 
+define arm_aapcs_vfpcc <4 x i16> @ptr_v4i16(<4 x i16*>* %offptr) {
+; CHECK-LABEL: ptr_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
+  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
+  ret <4 x i16> %gather
+}
+
 define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(<8 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -369,6 +388,25 @@ entry:
   ret <8 x half> %gather
 }
 
+define arm_aapcs_vfpcc <4 x half> @ptr_v4f16(<4 x half*>* %offptr) {
+; CHECK-LABEL: ptr_v4f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vldr.16 s8, [r1]
+; CHECK-NEXT:    vldr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vldr.16 s4, [r1]
+; CHECK-NEXT:    vldr.16 s1, [r0]
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    bx lr
+entry:
+  %offs = load <4 x half*>, <4 x half*>* %offptr, align 4
+  %gather = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x half> undef)
+  ret <4 x half> %gather
+}
+
 ; i8
 define arm_aapcs_vfpcc <16 x i8> @ptr_i8(<16 x i8*>* %offptr) {
@@ -499,6 +537,40 @@ entry:
   ret <8 x i16> %ext
 }
 
+define arm_aapcs_vfpcc <8 x i8> @ptr_v8i8(<8 x i8*>* %offptr) {
+; CHECK-LABEL: ptr_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    vmov r3, r12, d1
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, lr, d1
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r6, [r3]
+; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vmov.16 q0[1], r5
+; CHECK-NEXT:    ldrb.w r3, [lr]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    ldrb.w r12, [r12]
+; CHECK-NEXT:    vmov.16 q0[3], r3
+; CHECK-NEXT:    vmov.16 q0[4], r1
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov.16 q0[6], r6
+; CHECK-NEXT:    vmov.16 q0[7], r12
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
+entry:
+  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
+  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
+  ret <8 x i8> %gather
+}
+
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_sext32:
 ; CHECK:       @ %bb.0: @ %entry
@@ -543,6 +615,25 @@ entry:
   ret <4 x i32> %ext
 }
 
+define arm_aapcs_vfpcc <4 x i8> @ptr_v4i8(<4 x i8*>* %offptr) {
+; CHECK-LABEL: ptr_v4i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
+  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
+  ret <4 x i8> %gather
+}
+
 define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(<8 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i8_sext32:
 ; CHECK:       @ %bb.0: @ %entry
@@ -623,17 +714,17 @@ define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
-; CHECK-NEXT:  .LBB22_1: @ %vector.body.preheader
+; CHECK-NEXT:  .LBB26_1: @ %vector.body.preheader
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
-; CHECK-NEXT:  .LBB22_2: @ %vector.body
+; CHECK-NEXT:  .LBB26_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vldrwt.u32 q1, [q0]
 ; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
-; CHECK-NEXT:    le lr, .LBB22_2
+; CHECK-NEXT:    le lr, .LBB26_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.end
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
@@ -668,17 +759,17 @@ define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
-; CHECK-NEXT:  .LBB23_1: @ %vector.body.preheader
+; CHECK-NEXT:  .LBB27_1: @ %vector.body.preheader
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
-; CHECK-NEXT:  .LBB23_2: @ %vector.body
+; CHECK-NEXT:  .LBB27_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vldrwt.u32 q1, [q0]
 ; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
-; CHECK-NEXT:    le lr, .LBB23_2
+; CHECK-NEXT:    le lr, .LBB27_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.end
 ; CHECK-NEXT:    pop {r7, pc}
 entry:

View File

@@ -47,7 +47,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offset
 define arm_aapcs_vfpcc void @scaled_v8i16_sext(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: scaled_v8i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -88,7 +88,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offset
 define arm_aapcs_vfpcc void @scaled_v8f16_sext(i16* %base, <8 x i16>* %offptr, <8 x half> %input) {
 ; CHECK-LABEL: scaled_v8f16_sext:
 ; CHECK:       @ %bb.0: @ %entry
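
The retitled comments record why each of these tests still expands: "sext
offset" marks cases where the i16 offsets are sign-extended before feeding
the GEP, and the MVE base-plus-vector-of-offsets forms treat the offset
lanes as unsigned, so the sign extension cannot simply be folded away. An
illustrative shape modeled on the tests in this file (not verbatim):

  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  ; this sext is what blocks the base + offsets scatter form
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext
  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)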

View File

@@ -65,7 +65,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @unscaled_v8i16_sext(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: unscaled_v8i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -105,7 +105,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @unscaled_v8f16_sext(i8* %base, <8 x i16>* %offptr, <8 x half> %input) {
 ; CHECK-LABEL: unscaled_v8f16_sext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -139,7 +139,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - i32 offsets
 define arm_aapcs_vfpcc void @unscaled_v8i16_noext(i8* %base, <8 x i32>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: unscaled_v8i16_noext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -178,7 +178,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - i32 offsets
 define arm_aapcs_vfpcc void @unscaled_v8f16_noext(i8* %base, <8 x i32>* %offptr, <8 x half> %input) {
 ; CHECK-LABEL: unscaled_v8f16_noext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -243,7 +243,7 @@ entry:
   ret void
 }
 
-; Expand ?
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -287,7 +287,6 @@ entry:
   ret void
 }
 
-; Expand ?
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -323,7 +322,7 @@ entry:
   ret void
 }
 
-; Expand ?
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i32_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -360,7 +359,6 @@ entry:
   ret void
 }
 
-; Expand ?
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -391,7 +389,7 @@ entry:
   ret void
 }
 
-; Expand ?
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry

View File

@@ -16,7 +16,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expanded ?
 define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
 ; CHECK-LABEL: unscaled_v8i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -79,7 +79,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_sext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -142,7 +142,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_i16:
 ; CHECK:       @ %bb.0: @ %entry
@@ -205,7 +205,7 @@ entry:
   ret void
 }
 
-; Expand
+; Could be manually scaled offsets
 define arm_aapcs_vfpcc void @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_scaled:
 ; CHECK:       @ %bb.0: @ %entry
@@ -273,7 +273,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - large offsets
 define arm_aapcs_vfpcc void @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_i8_next:
 ; CHECK:       @ %bb.0: @ %entry
@@ -335,7 +335,6 @@ entry:
   ret void
 }
 
-; Expand
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <16 x i8>* %offptr, <16 x i64> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -396,7 +395,6 @@ entry:
   ret void
 }
 
-; Expand
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -441,7 +439,6 @@ entry:
   ret void
 }
 
-; Expand
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry

View File

@@ -251,6 +251,24 @@ entry:
   ret void
 }
 
+define arm_aapcs_vfpcc void @ptr_v4i16_dup(i32 %v, <4 x i16*> %offs) {
+; CHECK-LABEL: ptr_v4i16_dup:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    vmov r3, r12, d1
+; CHECK-NEXT:    strh r0, [r1]
+; CHECK-NEXT:    strh r0, [r2]
+; CHECK-NEXT:    strh r0, [r3]
+; CHECK-NEXT:    strh.w r0, [r12]
+; CHECK-NEXT:    bx lr
+entry:
+  %ext = trunc i32 %v to i16
+  %splatinsert = insertelement <4 x i16> poison, i16 %ext, i32 0
+  %splat = shufflevector <4 x i16> %splatinsert, <4 x i16> poison, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %splat, <4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
 ; Expand
 define arm_aapcs_vfpcc void @ptr_v8i16_trunc(<8 x i32> %v, <8 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i16_trunc:
@@ -314,6 +332,42 @@ entry:
   ret void
 }
 
+define arm_aapcs_vfpcc void @ptr_v4f16(<4 x half> %v, <4 x half*>* %offptr) {
+; CHECK-LABEL: ptr_v4f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vstr.16 s8, [r1]
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmovx.f16 s0, s1
+; CHECK-NEXT:    vstr.16 s1, [r0]
+; CHECK-NEXT:    vstr.16 s0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %offs = load <4 x half*>, <4 x half*>* %offptr, align 4
+  call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %v, <4 x half*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @ptr_v4f16_dup(half %v, <4 x half*> %offs) {
+; CHECK-LABEL: ptr_v4f16_dup:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vstr.16 s0, [r1]
+; CHECK-NEXT:    vstr.16 s0, [r2]
+; CHECK-NEXT:    vstr.16 s0, [r3]
+; CHECK-NEXT:    bx lr
+entry:
+  %splatinsert = insertelement <4 x half> poison, half %v, i32 0
+  %splat = shufflevector <4 x half> %splatinsert, <4 x half> poison, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %splat, <4 x half*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
 ; i8
 ; Expand.
@@ -473,14 +527,14 @@ define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
-; CHECK-NEXT:  .LBB16_1: @ %vector.body
+; CHECK-NEXT:  .LBB19_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
 ; CHECK-NEXT:    vstrwt.32 q1, [q0]
-; CHECK-NEXT:    bne .LBB16_1
+; CHECK-NEXT:    bne .LBB19_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.end
 ; CHECK-NEXT:    bx lr
 entry:
@@ -513,14 +567,14 @@ define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
-; CHECK-NEXT:  .LBB17_1: @ %vector.body
+; CHECK-NEXT:  .LBB20_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
 ; CHECK-NEXT:    vstrwt.32 q1, [q0]
-; CHECK-NEXT:    bne .LBB17_1
+; CHECK-NEXT:    bne .LBB20_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.end
 ; CHECK-NEXT:    bx lr
 entry: