
[SVE][CodeGen] Improve codegen of scalable masked scatters

If the scatter store is able to perform the sign/zero extend of
its index, this is folded into the instruction with refineIndexType().
Additionally, refineUniformBase() will return the base pointer and index
from an add + splat_vector.

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D90942
Kerry McLaughlin 2020-11-13 10:51:17 +00:00
parent 69f353da9d
commit c26a89a1b4
7 changed files with 118 additions and 370 deletions
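To make the pattern concrete before the diff: the combine targets a scalable masked scatter whose pointer vector is a splatted scalar base plus (possibly extended) vector offsets. A minimal IR sketch, with hypothetical function and value names rather than anything taken from the commit:

  define void @example(<vscale x 4 x i16> %data, i16* %base,
                       <vscale x 4 x i32> %offsets, <vscale x 4 x i1> %mask) {
    ; Before this change, the sext and the splat of %base were both
    ; materialized as separate vector instructions ahead of the scatter.
    %ext = sext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
    %ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %ext
    call void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16> %data, <vscale x 4 x i16*> %ptrs, i32 2, <vscale x 4 x i1> %mask)
    ret void
  }
  declare void @llvm.masked.scatter.nxv4i16.nxv4p0i16(<vscale x 4 x i16>, <vscale x 4 x i16*>, i32, <vscale x 4 x i1>)

With the changes below this should select a single extending-offset store, st1h { z0.s }, p0, [x0, z1.s, sxtw #1], as the updated tests check.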

llvm/include/llvm/CodeGen/TargetLowering.h

@@ -1318,6 +1318,10 @@ public:
            getIndexedMaskedStoreAction(IdxMode, VT.getSimpleVT()) == Custom);
  }

  // Returns true if VT is a legal index type for masked gathers/scatters
  // on this target
  virtual bool shouldRemoveExtendFromGSIndex(EVT VT) const { return false; }

  /// Return how the condition code should be treated: either it is legal, needs
  /// to be expanded to some other code sequence, or the target has a custom
  /// expander for it.

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

@@ -9399,16 +9399,74 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
                     TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
}
bool refineUniformBase(SDValue &BasePtr, SDValue &Index, SelectionDAG &DAG) {
  if (!isNullConstant(BasePtr) || Index.getOpcode() != ISD::ADD)
    return false;

  // For now we check only the LHS of the add.
  SDValue LHS = Index.getOperand(0);
  SDValue SplatVal = DAG.getSplatValue(LHS);
  if (!SplatVal)
    return false;

  BasePtr = SplatVal;
  Index = Index.getOperand(1);
  return true;
}
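In other words, a scatter whose base pointer is the constant null and whose index is add(splat_vector(%base), %offsets) is rewritten to use %base as the scalar base and %offsets as the index, which is exactly SVE's [x0, z1.d] addressing form. A rough IR-level analogue of the matched index, with hypothetical names and the splat written out:

  %ins = insertelement <vscale x 2 x i64> undef, i64 %base, i32 0
  %splat = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  %index = add <vscale x 2 x i64> %splat, %offsets  ; BasePtr <- %base, Index <- %offsets

Note the comment in the code: only the LHS of the add is checked for the splat, so the commuted form add(%offsets, splat(%base)) is not yet matched.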
// Fold sext/zext of index into index type.
bool refineIndexType(MaskedScatterSDNode *MSC, SDValue &Index, bool Scaled,
                     SelectionDAG &DAG) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Op = Index.getOperand(0);

  if (Index.getOpcode() == ISD::ZERO_EXTEND) {
    MSC->setIndexType(Scaled ? ISD::UNSIGNED_SCALED : ISD::UNSIGNED_UNSCALED);
    if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
      Index = Op;
      return true;
    }
  }

  if (Index.getOpcode() == ISD::SIGN_EXTEND) {
    MSC->setIndexType(Scaled ? ISD::SIGNED_SCALED : ISD::SIGNED_UNSCALED);
    if (TLI.shouldRemoveExtendFromGSIndex(Op.getValueType())) {
      Index = Op;
      return true;
    }
  }

  return false;
}
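The function records the extend's signedness in the scatter's index type (zext selects an UNSIGNED_* index type, sext a SIGNED_* one), and if the target reports that the narrower type is acceptable via shouldRemoveExtendFromGSIndex, the extend node itself is dropped. Sketched at the IR level, with hypothetical names:

  ; The zext need not be materialized: the scatter can treat its i32
  ; index as unsigned and widen for free via the uxtw addressing mode.
  %ext = zext <vscale x 4 x i32> %offsets to <vscale x 4 x i64>
  %ptrs = getelementptr i32, i32* %base, <vscale x 4 x i64> %ext
  ; -> scatter keeps the <vscale x 4 x i32> index, index type UNSIGNED_SCALED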
SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
  MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
  SDValue Mask = MSC->getMask();
  SDValue Chain = MSC->getChain();
  SDValue Index = MSC->getIndex();
  SDValue Scale = MSC->getScale();
  SDValue StoreVal = MSC->getValue();
  SDValue BasePtr = MSC->getBasePtr();
  SDLoc DL(N);

  // Zap scatters with a zero mask.
  if (ISD::isBuildVectorAllZeros(Mask.getNode()))
    return Chain;

  if (refineUniformBase(BasePtr, Index, DAG)) {
    SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
    return DAG.getMaskedScatter(
        DAG.getVTList(MVT::Other), StoreVal.getValueType(), DL, Ops,
        MSC->getMemOperand(), MSC->getIndexType(), MSC->isTruncatingStore());
  }

  if (refineIndexType(MSC, Index, MSC->isIndexScaled(), DAG)) {
    SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, Scale};
    return DAG.getMaskedScatter(
        DAG.getVTList(MVT::Other), StoreVal.getValueType(), DL, Ops,
        MSC->getMemOperand(), MSC->getIndexType(), MSC->isTruncatingStore());
  }

  return SDValue();
}
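A side note on the pre-existing early exit above: a scatter whose mask is known all-false stores nothing, so the node is simply replaced by its chain. Assuming the all-zero mask is recognized (e.g. a zeroinitializer constant), a call like this disappears entirely:

  ; Dead scatter: every lane is masked off.
  call void @llvm.masked.scatter.nxv2i64.nxv2p0i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> zeroinitializer)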

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

@@ -3705,6 +3705,14 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
  }
}

bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
  if (VT.getVectorElementType() == MVT::i32 &&
      VT.getVectorElementCount().getKnownMinValue() >= 4)
    return true;

  return false;
}
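AArch64 accepts an i32 index only when it has at least four (known-minimum) elements: an nxv4i32 index maps directly onto the sxtw/uxtw-extending forms of the SVE scatter, while an nxv2i32 index already lives in a 64-bit container and is better handled when the scatter itself is lowered. Illustrative IR, hypothetical names:

  ; nxv4i32: hook returns true, the extend folds into the scatter.
  %ext4 = sext <vscale x 4 x i32> %idx4 to <vscale x 4 x i64>
  ; nxv2i32: hook returns false (min element count 2 < 4); the extend
  ; survives to LowerMSCATTER below, which strips it instead.
  %ext2 = sext <vscale x 2 x i32> %idx2 to <vscale x 2 x i64>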
bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
  return ExtVal.getValueType().isScalableVector();
}
@@ -3792,11 +3800,8 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
     InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
   }

-  if (getScatterIndexIsExtended(Index)) {
-    if (Index.getOpcode() == ISD::AND)
-      IsSigned = false;
+  if (getScatterIndexIsExtended(Index))
     Index = Index.getOperand(0);
-  }

   SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
   return DAG.getNode(getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend), DL,
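The deleted ISD::AND special case (a zero-extend in disguise) appears to be unnecessary now that the combiner records signedness in the scatter's index type, so lowering only has to strip a remaining extend node. This is the path that still folds the extend for the nxv2 tests below, e.g. (from the 32-bit unscaled offsets test):

  %offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
  %byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
  ; now selects: st1b { z0.d }, p0, [x0, z1.d, sxtw]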

llvm/lib/Target/AArch64/AArch64ISelLowering.h

@@ -980,6 +980,7 @@ private:
    return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
  }

  bool shouldRemoveExtendFromGSIndex(EVT VT) const override;
  bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
  bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
  bool mayBeEmittedAsTailCall(const CallInst *CI) const override;

llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-scaled.ll

@@ -166,15 +166,7 @@ define void @masked_scatter_nxv2f64_zext(<vscale x 2 x double> %data, double* %b
define void @masked_scatter_nxv4i16_sext(<vscale x 4 x i16> %data, i16* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i16_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, sxtw #1]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, sxtw #1]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
; CHECK-NEXT: ret
%ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %ext
@@ -185,15 +177,7 @@ define void @masked_scatter_nxv4i16_sext(<vscale x 4 x i16> %data, i16* %base, <
define void @masked_scatter_nxv4i32_sext(<vscale x 4 x i32> %data, i32* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i32_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, sxtw #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, sxtw #2]
; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
; CHECK-NEXT: ret
%ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr i32, i32* %base, <vscale x 4 x i64> %ext
@@ -204,15 +188,7 @@ define void @masked_scatter_nxv4i32_sext(<vscale x 4 x i32> %data, i32* %base, <
define void @masked_scatter_nxv4f16_sext(<vscale x 4 x half> %data, half* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4f16_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, sxtw #1]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, sxtw #1]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
; CHECK-NEXT: ret
%ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr half, half* %base, <vscale x 4 x i64> %ext
@@ -223,15 +199,7 @@ define void @masked_scatter_nxv4f16_sext(<vscale x 4 x half> %data, half* %base,
define void @masked_scatter_nxv4bf16_sext(<vscale x 4 x bfloat> %data, bfloat* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4bf16_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, sxtw #1]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, sxtw #1]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
; CHECK-NEXT: ret
%ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr bfloat, bfloat* %base, <vscale x 4 x i64> %ext
@@ -242,15 +210,7 @@ define void @masked_scatter_nxv4bf16_sext(<vscale x 4 x bfloat> %data, bfloat* %
define void @masked_scatter_nxv4f32_sext(<vscale x 4 x float> %data, float* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4f32_sext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, sxtw #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, sxtw #2]
; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
; CHECK-NEXT: ret
%ext = sext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr float, float* %base, <vscale x 4 x i64> %ext
@@ -261,15 +221,7 @@ define void @masked_scatter_nxv4f32_sext(<vscale x 4 x float> %data, float* %bas
define void @masked_scatter_nxv4i16_zext(<vscale x 4 x i16> %data, i16* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i16_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, uxtw #1]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, uxtw #1]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, uxtw #1]
; CHECK-NEXT: ret
%ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr i16, i16* %base, <vscale x 4 x i64> %ext
@@ -280,15 +232,7 @@ define void @masked_scatter_nxv4i16_zext(<vscale x 4 x i16> %data, i16* %base, <
define void @masked_scatter_nxv4i32_zext(<vscale x 4 x i32> %data, i32* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i32_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, uxtw #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, uxtw #2]
; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, uxtw #2]
; CHECK-NEXT: ret
%ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr i32, i32* %base, <vscale x 4 x i64> %ext
@@ -299,15 +243,7 @@ define void @masked_scatter_nxv4i32_zext(<vscale x 4 x i32> %data, i32* %base, <
define void @masked_scatter_nxv4f16_zext(<vscale x 4 x half> %data, half* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4f16_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, uxtw #1]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, uxtw #1]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, uxtw #1]
; CHECK-NEXT: ret
%ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr half, half* %base, <vscale x 4 x i64> %ext
@@ -318,15 +254,7 @@ define void @masked_scatter_nxv4f16_zext(<vscale x 4 x half> %data, half* %base,
define void @masked_scatter_nxv4bf16_zext(<vscale x 4 x bfloat> %data, bfloat* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4bf16_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x0, z1.d, uxtw #1]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z2.d, uxtw #1]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, uxtw #1]
; CHECK-NEXT: ret
%ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr bfloat, bfloat* %base, <vscale x 4 x i64> %ext
@@ -337,15 +265,7 @@ define void @masked_scatter_nxv4bf16_zext(<vscale x 4 x bfloat> %data, bfloat* %
define void @masked_scatter_nxv4f32_zext(<vscale x 4 x float> %data, float* %base, <vscale x 4 x i32> %indexes, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4f32_zext:
; CHECK: // %bb.0:
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: uunpkhi z2.d, z1.s
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x0, z1.d, uxtw #2]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z2.d, uxtw #2]
; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, uxtw #2]
; CHECK-NEXT: ret
%ext = zext <vscale x 4 x i32> %indexes to <vscale x 4 x i64>
%ptrs = getelementptr float, float* %base, <vscale x 4 x i64> %ext

llvm/test/CodeGen/AArch64/sve-masked-scatter-32b-unscaled.ll

@@ -8,12 +8,7 @@
define void @masked_scatter_nxv2i8_sext_offsets(<vscale x 2 x i8> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i8_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1b { z0.d }, p0, [x0, z1.d, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -25,12 +20,7 @@ define void @masked_scatter_nxv2i8_sext_offsets(<vscale x 2 x i8> %data, i8* %ba
define void @masked_scatter_nxv2i16_sext_offsets(<vscale x 2 x i16> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i16_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -42,12 +32,7 @@ define void @masked_scatter_nxv2i16_sext_offsets(<vscale x 2 x i16> %data, i8* %
define void @masked_scatter_nxv2i32_sext_offsets(<vscale x 2 x i32> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i32_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -59,12 +44,7 @@ define void @masked_scatter_nxv2i32_sext_offsets(<vscale x 2 x i32> %data, i8* %
define void @masked_scatter_nxv2i64_sext_offsets(<vscale x 2 x i64> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i64_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -76,12 +56,7 @@ define void @masked_scatter_nxv2i64_sext_offsets(<vscale x 2 x i64> %data, i8* %
define void @masked_scatter_nxv2f16_sext_offsets(<vscale x 2 x half> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f16_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -93,12 +68,7 @@ define void @masked_scatter_nxv2f16_sext_offsets(<vscale x 2 x half> %data, i8*
define void @masked_scatter_nxv2bf16_sext_offsets(<vscale x 2 x bfloat> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv2bf16_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -110,12 +80,7 @@ define void @masked_scatter_nxv2bf16_sext_offsets(<vscale x 2 x bfloat> %data, i
define void @masked_scatter_nxv2f32_sext_offsets(<vscale x 2 x float> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f32_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -127,12 +92,7 @@ define void @masked_scatter_nxv2f32_sext_offsets(<vscale x 2 x float> %data, i8*
define void @masked_scatter_nxv2f64_sext_offsets(<vscale x 2 x double> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f64_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p1.d
; CHECK-NEXT: sxtw z1.d, p1/m, z1.d
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -144,11 +104,7 @@ define void @masked_scatter_nxv2f64_sext_offsets(<vscale x 2 x double> %data, i8
define void @masked_scatter_nxv2i8_zext_offsets(<vscale x 2 x i8> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i8_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1b { z0.d }, p0, [x0, z1.d, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -160,11 +116,7 @@ define void @masked_scatter_nxv2i8_zext_offsets(<vscale x 2 x i8> %data, i8* %ba
define void @masked_scatter_nxv2i16_zext_offsets(<vscale x 2 x i16> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i16_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -176,11 +128,7 @@ define void @masked_scatter_nxv2i16_zext_offsets(<vscale x 2 x i16> %data, i8* %
define void @masked_scatter_nxv2i32_zext_offsets(<vscale x 2 x i32> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i32_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -192,11 +140,7 @@ define void @masked_scatter_nxv2i32_zext_offsets(<vscale x 2 x i32> %data, i8* %
define void @masked_scatter_nxv2i64_zext_offsets(<vscale x 2 x i64> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i64_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -208,11 +152,7 @@ define void @masked_scatter_nxv2i64_zext_offsets(<vscale x 2 x i64> %data, i8* %
define void @masked_scatter_nxv2f16_zext_offsets(<vscale x 2 x half> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f16_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -224,11 +164,7 @@ define void @masked_scatter_nxv2f16_zext_offsets(<vscale x 2 x half> %data, i8*
define void @masked_scatter_nxv2bf16_zext_offsets(<vscale x 2 x bfloat> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv2bf16_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -240,11 +176,7 @@ define void @masked_scatter_nxv2bf16_zext_offsets(<vscale x 2 x bfloat> %data, i
define void @masked_scatter_nxv2f32_zext_offsets(<vscale x 2 x float> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f32_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -256,11 +188,7 @@ define void @masked_scatter_nxv2f32_zext_offsets(<vscale x 2 x float> %data, i8*
define void @masked_scatter_nxv2f64_zext_offsets(<vscale x 2 x double> %data, i8* %base, <vscale x 2 x i32> %i32offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f64_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: and z1.d, z1.d, #0xffffffff
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 2 x i32> %i32offsets to <vscale x 2 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
@@ -275,19 +203,7 @@ define void @masked_scatter_nxv2f64_zext_offsets(<vscale x 2 x double> %data, i8
define void @masked_scatter_nxv4i8_sext_offsets(<vscale x 4 x i8> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i8_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: sunpklo z3.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1b { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1b { z0.s }, p0, [x0, z1.s, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
@@ -299,19 +215,7 @@ define void @masked_scatter_nxv4i8_sext_offsets(<vscale x 4 x i8> %data, i8* %ba
define void @masked_scatter_nxv4i16_sext_offsets(<vscale x 4 x i16> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i16_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: sunpklo z3.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
@@ -323,19 +227,7 @@ define void @masked_scatter_nxv4i16_sext_offsets(<vscale x 4 x i16> %data, i8* %
define void @masked_scatter_nxv4i32_sext_offsets(<vscale x 4 x i32> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i32_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: sunpklo z3.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
@@ -347,19 +239,7 @@ define void @masked_scatter_nxv4i32_sext_offsets(<vscale x 4 x i32> %data, i8* %
define void @masked_scatter_nxv4f16_sext_offsets(<vscale x 4 x half> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4f16_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: sunpklo z3.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
@@ -371,19 +251,7 @@ define void @masked_scatter_nxv4f16_sext_offsets(<vscale x 4 x half> %data, i8*
define void @masked_scatter_nxv4bf16_sext_offsets(<vscale x 4 x bfloat> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4bf16_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: sunpklo z3.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
@@ -395,19 +263,7 @@ define void @masked_scatter_nxv4bf16_sext_offsets(<vscale x 4 x bfloat> %data, i
define void @masked_scatter_nxv4f32_sext_offsets(<vscale x 4 x float> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4f32_sext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: sunpklo z3.d, z1.s
; CHECK-NEXT: sunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, sxtw]
; CHECK-NEXT: ret
%offsets = sext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
@@ -419,19 +275,7 @@ define void @masked_scatter_nxv4f32_sext_offsets(<vscale x 4 x float> %data, i8*
define void @masked_scatter_nxv4i8_zext_offsets(<vscale x 4 x i8> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i8_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1b { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1b { z0.s }, p0, [x0, z1.s, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
@@ -443,19 +287,7 @@ define void @masked_scatter_nxv4i8_zext_offsets(<vscale x 4 x i8> %data, i8* %ba
define void @masked_scatter_nxv4i16_zext_offsets(<vscale x 4 x i16> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i16_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
@@ -467,19 +299,7 @@ define void @masked_scatter_nxv4i16_zext_offsets(<vscale x 4 x i16> %data, i8* %
define void @masked_scatter_nxv4i32_zext_offsets(<vscale x 4 x i32> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4i32_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
@@ -491,19 +311,7 @@ define void @masked_scatter_nxv4i32_zext_offsets(<vscale x 4 x i32> %data, i8* %
define void @masked_scatter_nxv4f16_zext_offsets(<vscale x 4 x half> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv4f16_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
@@ -515,19 +323,7 @@ define void @masked_scatter_nxv4f16_zext_offsets(<vscale x 4 x half> %data, i8*
define void @masked_scatter_nxv4bf16_zext_offsets(<vscale x 4 x bfloat> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4bf16_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1h { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets
@@ -539,19 +335,7 @@ define void @masked_scatter_nxv4bf16_zext_offsets(<vscale x 4 x bfloat> %data, i
define void @masked_scatter_nxv4f32_zext_offsets(<vscale x 4 x float> %data, i8* %base, <vscale x 4 x i32> %i32offsets, <vscale x 4 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv4f32_zext_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: uunpklo z3.d, z1.s
; CHECK-NEXT: uunpkhi z1.d, z1.s
; CHECK-NEXT: pfalse p1.b
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: add z2.d, z2.d, z3.d
; CHECK-NEXT: uunpklo z3.d, z0.s
; CHECK-NEXT: uunpkhi z0.d, z0.s
; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
; CHECK-NEXT: st1w { z3.d }, p2, [x8, z2.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1w { z0.s }, p0, [x0, z1.s, uxtw]
; CHECK-NEXT: ret
%offsets = zext <vscale x 4 x i32> %i32offsets to <vscale x 4 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 4 x i64> %offsets

llvm/test/CodeGen/AArch64/sve-masked-scatter-64b-unscaled.ll

@@ -8,10 +8,7 @@
define void @masked_scatter_nxv2i8_unscaled_64bit_offsets(<vscale x 2 x i8> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i8_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1b { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1b { z0.d }, p0, [x0, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i8*>
@@ -22,10 +19,7 @@ define void @masked_scatter_nxv2i8_unscaled_64bit_offsets(<vscale x 2 x i8> %dat
define void @masked_scatter_nxv2i16_unscaled_64bit_offsets(<vscale x 2 x i16> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i16_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
@@ -36,10 +30,7 @@ define void @masked_scatter_nxv2i16_unscaled_64bit_offsets(<vscale x 2 x i16> %d
define void @masked_scatter_nxv2i32_unscaled_64bit_offsets(<vscale x 2 x i32> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i32_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
@@ -50,10 +41,7 @@ define void @masked_scatter_nxv2i32_unscaled_64bit_offsets(<vscale x 2 x i32> %d
define void @masked_scatter_nxv2i64_unscaled_64bit_offsets(<vscale x 2 x i64> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2i64_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
@@ -64,10 +52,7 @@ define void @masked_scatter_nxv2i64_unscaled_64bit_offsets(<vscale x 2 x i64> %d
define void @masked_scatter_nxv2f16_unscaled_64bit_offsets(<vscale x 2 x half> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind {
; CHECK-LABEL: masked_scatter_nxv2f16_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
@@ -78,10 +63,7 @@ define void @masked_scatter_nxv2f16_unscaled_64bit_offsets(<vscale x 2 x half> %
define void @masked_scatter_nxv2bf16_unscaled_64bit_offsets(<vscale x 2 x bfloat> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv2bf16_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1h { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1h { z0.d }, p0, [x0, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x bfloat*>
@@ -92,10 +74,7 @@ define void @masked_scatter_nxv2bf16_unscaled_64bit_offsets(<vscale x 2 x bfloat
define void @masked_scatter_nxv2f32_unscaled_64bit_offsets(<vscale x 2 x float> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv2f32_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1w { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1w { z0.d }, p0, [x0, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
@@ -106,10 +85,7 @@ define void @masked_scatter_nxv2f32_unscaled_64bit_offsets(<vscale x 2 x float>
define void @masked_scatter_nxv2f64_unscaled_64bit_offsets(<vscale x 2 x double> %data, i8* %base, <vscale x 2 x i64> %offsets, <vscale x 2 x i1> %masks) nounwind #0 {
; CHECK-LABEL: masked_scatter_nxv2f64_unscaled_64bit_offsets:
; CHECK: // %bb.0:
; CHECK-NEXT: mov z2.d, x0
; CHECK-NEXT: mov x8, xzr
; CHECK-NEXT: add z1.d, z2.d, z1.d
; CHECK-NEXT: st1d { z0.d }, p0, [x8, z1.d]
; CHECK-NEXT: st1d { z0.d }, p0, [x0, z1.d]
; CHECK-NEXT: ret
%byte_ptrs = getelementptr i8, i8* %base, <vscale x 2 x i64> %offsets
%ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>