[ARM] Rejig some of the MVE gather/scatter lowering pass. NFC
This adjusts how the gather/scatter lowering pass passes data around and where certain gathers/scatters are created. It should not affect code generation on its own, but it allows other patches to reason more clearly about the code. A number of extra test cases were also added for smaller gathers/scatters that can be extended, and some of the test comments were updated.
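In outline, the rejig inverts the call order: lowerGather and lowerScatter now try the incrementing form first, passing the raw pointer operand straight through, and tryCreateIncrementingGatScat performs its own GEP decomposition via the renamed decomposeGEP. The sketch below condenses the new gather flow; it paraphrases the diff rather than quoting the pass verbatim, and lowerGatherSketch is a hypothetical free-function stand-in for the member function.

// Sketch only: the lowering order this patch establishes. The helper
// signatures mirror the ones in the diff below.
Value *lowerGatherSketch(IntrinsicInst *I, Value *Ptr, Instruction *Root,
                         IRBuilder<> &Builder) {
  // First preference: a QI-style incrementing gather. It now decomposes
  // the GEP itself and only fires for v4i32 gathers inside a loop.
  Value *Load = tryCreateIncrementingGatScat(I, Ptr, Builder);
  if (!Load) // Next: a base register plus a vector of offsets.
    Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
  if (!Load) // Fallback: a plain vector-of-pointers gather.
    Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
  return Load;
}

The scatter path mirrors this order, which is what lets the duplicated "constant increment" probing disappear from the two Offset helpers.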
commit e55f90b072 (parent 30958bdc95)
@@ -84,8 +84,8 @@ private:
   // Check for a getelementptr and deduce base and offsets from it, on success
   // returning the base directly and the offsets indirectly using the Offsets
   // argument
-  Value *checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP,
-                  IRBuilder<> &Builder);
+  Value *decomposeGEP(Value *&Offsets, FixedVectorType *Ty,
+                      GetElementPtrInst *GEP, IRBuilder<> &Builder);
   // Compute the scale of this gather/scatter instruction
   int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
   // If the value is a constant, or derived from constants via additions
@@ -123,8 +123,7 @@ private:
 
   // QI gathers and scatters can increment their offsets on their own if
   // the increment is a constant value (digit)
-  Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *BasePtr,
-                                      Value *Ptr, GetElementPtrInst *GEP,
+  Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *Ptr,
                                       IRBuilder<> &Builder);
   // QI gathers/scatters can increment their offsets on their own if the
   // increment is a constant value (digit) - this creates a writeback QI
@@ -214,9 +213,10 @@ static bool checkOffsetSize(Value *Offsets, unsigned TargetElemCount) {
   return true;
 }
 
-Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, FixedVectorType *Ty,
-                                          GetElementPtrInst *GEP,
-                                          IRBuilder<> &Builder) {
+Value *MVEGatherScatterLowering::decomposeGEP(Value *&Offsets,
+                                              FixedVectorType *Ty,
+                                              GetElementPtrInst *GEP,
+                                              IRBuilder<> &Builder) {
   if (!GEP) {
     LLVM_DEBUG(dbgs() << "masked gathers/scatters: no getelementpointer "
                       << "found\n");
@@ -372,7 +372,10 @@ Value *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
   Builder.SetCurrentDebugLocation(I->getDebugLoc());
 
   Instruction *Root = I;
-  Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
+
+  Value *Load = tryCreateIncrementingGatScat(I, Ptr, Builder);
+  if (!Load)
+    Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
   if (!Load)
     Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
   if (!Load)
@@ -478,14 +481,9 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
   Value *BasePtr =
-      checkGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder);
+      decomposeGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder);
   if (!BasePtr)
     return nullptr;
-  // Check whether the offset is a constant increment that could be merged into
-  // a QI gather
-  Value *Load = tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
-  if (Load)
-    return Load;
 
   int Scale =
       computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
@@ -533,7 +531,9 @@ Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
   Builder.SetInsertPoint(I);
   Builder.SetCurrentDebugLocation(I->getDebugLoc());
 
-  Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
+  Value *Store = tryCreateIncrementingGatScat(I, Ptr, Builder);
+  if (!Store)
+    Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
   if (!Store)
     Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
   if (!Store)
@@ -598,6 +598,7 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
   Value *Mask = I->getArgOperand(3);
   Type *InputTy = Input->getType();
   Type *MemoryTy = InputTy;
+
   LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
                     << " to base + vector of offsets\n");
   // If the input has been truncated, try to integrate that trunc into the
@@ -619,15 +620,10 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
   Value *BasePtr =
-      checkGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder);
+      decomposeGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder);
   if (!BasePtr)
     return nullptr;
-  // Check whether the offset is a constant increment that could be merged into
-  // a QI gather
-  Value *Store =
-      tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
-  if (Store)
-    return Store;
 
   int Scale =
       computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
                    MemoryTy->getScalarSizeInBits());
@@ -652,21 +648,28 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
 }
 
 Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
-    IntrinsicInst *I, Value *BasePtr, Value *Offsets, GetElementPtrInst *GEP,
-    IRBuilder<> &Builder) {
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
   FixedVectorType *Ty;
   if (I->getIntrinsicID() == Intrinsic::masked_gather)
     Ty = cast<FixedVectorType>(I->getType());
   else
     Ty = cast<FixedVectorType>(I->getArgOperand(0)->getType());
+
   // Incrementing gathers only exist for v4i32
-  if (Ty->getNumElements() != 4 ||
-      Ty->getScalarSizeInBits() != 32)
+  if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
     return nullptr;
+  // Incrementing gathers are not beneficial outside of a loop
   Loop *L = LI->getLoopFor(I->getParent());
   if (L == nullptr)
-    // Incrementing gathers are not beneficial outside of a loop
     return nullptr;
+
+  // Decompose the GEP into Base and Offsets
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  Value *Offsets;
+  Value *BasePtr = decomposeGEP(Offsets, Ty, GEP, Builder);
+  if (!BasePtr)
+    return nullptr;
+
   LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
                     "wb gather/scatter\n");
 
@@ -689,6 +692,7 @@ Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
     if (Load != nullptr)
      return Load;
   }
+
   LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
                     "non-wb gather/scatter\n");
 
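Before the test diffs, a minimal LLVM IR illustration of the shape decomposeGEP takes apart. This is a hypothetical example, not one of the patch's test cases: the getelementptr splits into %base (returned directly) and %offs (returned through the Offsets out-parameter), after which computeScale derives the scale from the GEP source element size.

declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)

; Gather four i32s from %base plus the per-lane offsets in %offs.
define <4 x i32> @gather_base_plus_offsets(i32* %base, <4 x i32> %offs) {
entry:
  %ptrs = getelementptr i32, i32* %base, <4 x i32> %offs
  %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %g
}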
@@ -273,6 +273,25 @@ entry:
   ret <4 x i32> %ext
 }
 
+define arm_aapcs_vfpcc <4 x i16> @ptr_v4i16(<4 x i16*>* %offptr) {
+; CHECK-LABEL: ptr_v4i16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmov r0, r1, d1
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: ldrh r0, [r0]
+; CHECK-NEXT: ldrh r2, [r2]
+; CHECK-NEXT: ldrh r1, [r1]
+; CHECK-NEXT: ldrh r3, [r3]
+; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT: bx lr
+entry:
+  %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
+  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
+  ret <4 x i16> %gather
+}
+
 define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(<8 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i16_sext:
 ; CHECK: @ %bb.0: @ %entry
@@ -369,6 +388,25 @@ entry:
   ret <8 x half> %gather
 }
 
+define arm_aapcs_vfpcc <4 x half> @ptr_v4f16(<4 x half*>* %offptr) {
+; CHECK-LABEL: ptr_v4f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmov r0, r1, d2
+; CHECK-NEXT: vldr.16 s8, [r1]
+; CHECK-NEXT: vldr.16 s0, [r0]
+; CHECK-NEXT: vmov r0, r1, d3
+; CHECK-NEXT: vins.f16 s0, s8
+; CHECK-NEXT: vldr.16 s4, [r1]
+; CHECK-NEXT: vldr.16 s1, [r0]
+; CHECK-NEXT: vins.f16 s1, s4
+; CHECK-NEXT: bx lr
+entry:
+  %offs = load <4 x half*>, <4 x half*>* %offptr, align 4
+  %gather = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x half> undef)
+  ret <4 x half> %gather
+}
+
 ; i8
 
 define arm_aapcs_vfpcc <16 x i8> @ptr_i8(<16 x i8*>* %offptr) {
@@ -499,6 +537,40 @@ entry:
   ret <8 x i16> %ext
 }
 
+define arm_aapcs_vfpcc <8 x i8> @ptr_v8i8(<8 x i8*>* %offptr) {
+; CHECK-LABEL: ptr_v8i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r6, lr}
+; CHECK-NEXT: push {r4, r5, r6, lr}
+; CHECK-NEXT: vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT: vmov r1, r2, d0
+; CHECK-NEXT: vmov r3, r12, d1
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmov r4, r5, d0
+; CHECK-NEXT: vmov r0, lr, d1
+; CHECK-NEXT: ldrb r1, [r1]
+; CHECK-NEXT: ldrb r6, [r3]
+; CHECK-NEXT: ldrb r2, [r2]
+; CHECK-NEXT: ldrb r4, [r4]
+; CHECK-NEXT: ldrb r5, [r5]
+; CHECK-NEXT: vmov.16 q0[0], r4
+; CHECK-NEXT: ldrb r0, [r0]
+; CHECK-NEXT: vmov.16 q0[1], r5
+; CHECK-NEXT: ldrb.w r3, [lr]
+; CHECK-NEXT: vmov.16 q0[2], r0
+; CHECK-NEXT: ldrb.w r12, [r12]
+; CHECK-NEXT: vmov.16 q0[3], r3
+; CHECK-NEXT: vmov.16 q0[4], r1
+; CHECK-NEXT: vmov.16 q0[5], r2
+; CHECK-NEXT: vmov.16 q0[6], r6
+; CHECK-NEXT: vmov.16 q0[7], r12
+; CHECK-NEXT: pop {r4, r5, r6, pc}
+entry:
+  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
+  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
+  ret <8 x i8> %gather
+}
+
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_sext32:
 ; CHECK: @ %bb.0: @ %entry
@@ -543,6 +615,25 @@ entry:
   ret <4 x i32> %ext
 }
 
+define arm_aapcs_vfpcc <4 x i8> @ptr_v4i8(<4 x i8*>* %offptr) {
+; CHECK-LABEL: ptr_v4i8:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q0, [r0]
+; CHECK-NEXT: vmov r0, r1, d1
+; CHECK-NEXT: vmov r2, r3, d0
+; CHECK-NEXT: ldrb r0, [r0]
+; CHECK-NEXT: ldrb r2, [r2]
+; CHECK-NEXT: ldrb r1, [r1]
+; CHECK-NEXT: ldrb r3, [r3]
+; CHECK-NEXT: vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT: vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT: bx lr
+entry:
+  %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
+  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
+  ret <4 x i8> %gather
+}
+
 define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(<8 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i8_sext32:
 ; CHECK: @ %bb.0: @ %entry
@@ -623,17 +714,17 @@ define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
 ; CHECK-NEXT: cmp r2, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: poplt {r7, pc}
-; CHECK-NEXT: .LBB22_1: @ %vector.body.preheader
+; CHECK-NEXT: .LBB26_1: @ %vector.body.preheader
 ; CHECK-NEXT: subs r2, #4
 ; CHECK-NEXT: movs r3, #1
 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
-; CHECK-NEXT: .LBB22_2: @ %vector.body
+; CHECK-NEXT: .LBB26_2: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT: vptt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q1, [q0]
 ; CHECK-NEXT: vstrwt.32 q1, [r0], #16
-; CHECK-NEXT: le lr, .LBB22_2
+; CHECK-NEXT: le lr, .LBB26_2
 ; CHECK-NEXT: @ %bb.3: @ %for.end
 ; CHECK-NEXT: pop {r7, pc}
 entry:
@@ -668,17 +759,17 @@ define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
 ; CHECK-NEXT: cmp r2, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: poplt {r7, pc}
-; CHECK-NEXT: .LBB23_1: @ %vector.body.preheader
+; CHECK-NEXT: .LBB27_1: @ %vector.body.preheader
 ; CHECK-NEXT: subs r2, #4
 ; CHECK-NEXT: movs r3, #1
 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2
-; CHECK-NEXT: .LBB23_2: @ %vector.body
+; CHECK-NEXT: .LBB27_2: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT: vptt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q1, [q0]
 ; CHECK-NEXT: vstrwt.32 q1, [r0], #16
-; CHECK-NEXT: le lr, .LBB23_2
+; CHECK-NEXT: le lr, .LBB27_2
 ; CHECK-NEXT: @ %bb.3: @ %for.end
 ; CHECK-NEXT: pop {r7, pc}
 entry:
 
@@ -47,7 +47,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offset
 define arm_aapcs_vfpcc void @scaled_v8i16_sext(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: scaled_v8i16_sext:
 ; CHECK: @ %bb.0: @ %entry
@@ -88,7 +88,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offset
 define arm_aapcs_vfpcc void @scaled_v8f16_sext(i16* %base, <8 x i16>* %offptr, <8 x half> %input) {
 ; CHECK-LABEL: scaled_v8f16_sext:
 ; CHECK: @ %bb.0: @ %entry
 
@@ -65,7 +65,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @unscaled_v8i16_sext(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: unscaled_v8i16_sext:
 ; CHECK: @ %bb.0: @ %entry
@@ -105,7 +105,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @unscaled_v8f16_sext(i8* %base, <8 x i16>* %offptr, <8 x half> %input) {
 ; CHECK-LABEL: unscaled_v8f16_sext:
 ; CHECK: @ %bb.0: @ %entry
@@ -139,7 +139,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - i32 offsets
 define arm_aapcs_vfpcc void @unscaled_v8i16_noext(i8* %base, <8 x i32>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: unscaled_v8i16_noext:
 ; CHECK: @ %bb.0: @ %entry
@@ -178,7 +178,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - i32 offsets
 define arm_aapcs_vfpcc void @unscaled_v8f16_noext(i8* %base, <8 x i32>* %offptr, <8 x half> %input) {
 ; CHECK-LABEL: unscaled_v8f16_noext:
 ; CHECK: @ %bb.0: @ %entry
@@ -243,7 +243,7 @@ entry:
   ret void
 }
 
-; Expand ?
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
 ; CHECK: @ %bb.0: @ %entry
@@ -287,7 +287,6 @@ entry:
   ret void
 }
 
-; Expand ?
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
 ; CHECK: @ %bb.0: @ %entry
@@ -323,7 +322,7 @@ entry:
   ret void
 }
 
-; Expand ?
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i32_i8:
 ; CHECK: @ %bb.0: @ %entry
@@ -360,7 +359,6 @@ entry:
   ret void
 }
 
-; Expand ?
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
 ; CHECK: @ %bb.0: @ %entry
@@ -391,7 +389,7 @@ entry:
   ret void
 }
 
-; Expand ?
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
 
@@ -16,7 +16,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expanded ?
 define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
 ; CHECK-LABEL: unscaled_v8i8_i8:
 ; CHECK: @ %bb.0: @ %entry
@@ -79,7 +79,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_sext:
 ; CHECK: @ %bb.0: @ %entry
@@ -142,7 +142,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_i16:
 ; CHECK: @ %bb.0: @ %entry
@@ -205,7 +205,7 @@ entry:
   ret void
 }
 
-; Expand
+; Could be manually scaled offsets
 define arm_aapcs_vfpcc void @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_scaled:
 ; CHECK: @ %bb.0: @ %entry
@@ -273,7 +273,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - large offsets
 define arm_aapcs_vfpcc void @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_i8_next:
 ; CHECK: @ %bb.0: @ %entry
@@ -335,7 +335,6 @@ entry:
   ret void
 }
 
-; Expand
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <16 x i8>* %offptr, <16 x i64> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
 ; CHECK: @ %bb.0: @ %entry
@@ -396,7 +395,6 @@ entry:
   ret void
 }
 
-; Expand
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
 ; CHECK: @ %bb.0: @ %entry
@@ -441,7 +439,6 @@ entry:
   ret void
 }
 
-; Expand
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
 
@@ -251,6 +251,24 @@ entry:
   ret void
 }
 
+define arm_aapcs_vfpcc void @ptr_v4i16_dup(i32 %v, <4 x i16*> %offs) {
+; CHECK-LABEL: ptr_v4i16_dup:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r1, r2, d0
+; CHECK-NEXT: vmov r3, r12, d1
+; CHECK-NEXT: strh r0, [r1]
+; CHECK-NEXT: strh r0, [r2]
+; CHECK-NEXT: strh r0, [r3]
+; CHECK-NEXT: strh.w r0, [r12]
+; CHECK-NEXT: bx lr
+entry:
+  %ext = trunc i32 %v to i16
+  %splatinsert = insertelement <4 x i16> poison, i16 %ext, i32 0
+  %splat = shufflevector <4 x i16> %splatinsert, <4 x i16> poison, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %splat, <4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
 ; Expand
 define arm_aapcs_vfpcc void @ptr_v8i16_trunc(<8 x i32> %v, <8 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i16_trunc:
@@ -314,6 +332,42 @@ entry:
   ret void
 }
 
+define arm_aapcs_vfpcc void @ptr_v4f16(<4 x half> %v, <4 x half*>* %offptr) {
+; CHECK-LABEL: ptr_v4f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmovx.f16 s8, s0
+; CHECK-NEXT: vmov r0, r1, d2
+; CHECK-NEXT: vstr.16 s0, [r0]
+; CHECK-NEXT: vstr.16 s8, [r1]
+; CHECK-NEXT: vmov r0, r1, d3
+; CHECK-NEXT: vmovx.f16 s0, s1
+; CHECK-NEXT: vstr.16 s1, [r0]
+; CHECK-NEXT: vstr.16 s0, [r1]
+; CHECK-NEXT: bx lr
+entry:
+  %offs = load <4 x half*>, <4 x half*>* %offptr, align 4
+  call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %v, <4 x half*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @ptr_v4f16_dup(half %v, <4 x half*> %offs) {
+; CHECK-LABEL: ptr_v4f16_dup:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmov r0, r1, d2
+; CHECK-NEXT: vmov r2, r3, d3
+; CHECK-NEXT: vstr.16 s0, [r0]
+; CHECK-NEXT: vstr.16 s0, [r1]
+; CHECK-NEXT: vstr.16 s0, [r2]
+; CHECK-NEXT: vstr.16 s0, [r3]
+; CHECK-NEXT: bx lr
+entry:
+  %splatinsert = insertelement <4 x half> poison, half %v, i32 0
+  %splat = shufflevector <4 x half> %splatinsert, <4 x half> poison, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %splat, <4 x half*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
 ; i8
 
 ; Expand.
@@ -473,14 +527,14 @@ define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
 ; CHECK-NEXT: cmp r3, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
-; CHECK-NEXT: .LBB16_1: @ %vector.body
+; CHECK-NEXT: .LBB19_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT: subs r2, #4
 ; CHECK-NEXT: vptt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
 ; CHECK-NEXT: vstrwt.32 q1, [q0]
-; CHECK-NEXT: bne .LBB16_1
+; CHECK-NEXT: bne .LBB19_1
 ; CHECK-NEXT: @ %bb.2: @ %for.end
 ; CHECK-NEXT: bx lr
 entry:
@@ -513,14 +567,14 @@ define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
 ; CHECK-NEXT: cmp r3, #1
 ; CHECK-NEXT: it lt
 ; CHECK-NEXT: bxlt lr
-; CHECK-NEXT: .LBB17_1: @ %vector.body
+; CHECK-NEXT: .LBB20_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT: subs r2, #4
 ; CHECK-NEXT: vptt.i32 ne, q0, zr
 ; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
 ; CHECK-NEXT: vstrwt.32 q1, [q0]
-; CHECK-NEXT: bne .LBB17_1
+; CHECK-NEXT: bne .LBB20_1
 ; CHECK-NEXT: @ %bb.2: @ %for.end
 ; CHECK-NEXT: bx lr
 entry:
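For context, CHECK lines like those above are regenerated with utils/update_llc_test_checks.py rather than written by hand, which is why whole label numbers shift when tests are inserted. A representative RUN line for these MVE tests would look like the following; the exact flag set is an assumption, not copied from the files in this commit.

; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o - | FileCheck %s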