
[AArch64][SVE] Add support for fixed length MSCATTER/MGATHER

Since gather lowering can now produce nodes that may need expansion via
the vector legalizer, perform MGATHER lowering through the vector legalizer.

Additionally, as part of adding passthru support for fixed-length
gathers, fix passthru support for scalable types.
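
For illustration, a fixed-length gather such as the one below (a hypothetical
example, not taken from this patch's tests) can now be lowered to an SVE
gather load, with the passthru applied by a vector select, rather than being
scalarized, provided SVE is enabled and -aarch64-sve-vector-bits-min is large
enough for fixed-length SVE codegen:

; Hypothetical IR: fixed-length masked gather with a non-zero passthru.
define <8 x i32> @gather_v8i32(<8 x i32*>* %b, <8 x i1> %mask, <8 x i32> %passthru) #0 {
  %ptrs = load <8 x i32*>, <8 x i32*>* %b
  ; Lowered via the SVE gather path added here; the passthru becomes a select
  ; on the original mask rather than forcing per-element scalarization.
  %vals = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptrs, i32 4, <8 x i1> %mask, <8 x i32> %passthru)
  ret <8 x i32> %vals
}
declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)
attributes #0 = { "target-features"="+sve" }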

Depends on D104910

Differential Revision: https://reviews.llvm.org/D104217
Bradley Smith 2021-06-07 14:34:03 +01:00
parent d59877b7e5
commit e07552eb3f
6 changed files with 2265 additions and 23 deletions


@@ -457,6 +457,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::USHLSAT:
case ISD::FP_TO_SINT_SAT:
case ISD::FP_TO_UINT_SAT:
case ISD::MGATHER:
Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
break;
case ISD::SMULFIX:


@@ -1211,15 +1211,16 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, VT, Custom);
setOperationAction(ISD::UINT_TO_FP, VT, Custom);
}
}
// NEON doesn't support masked loads or stores, but SVE does
for (auto VT :
{MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
}
// NEON doesn't support masked loads/stores/gathers/scatters, but SVE does
for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v2f32, MVT::v4f32, MVT::v1f64,
MVT::v2f64, MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
MVT::v2i32, MVT::v4i32, MVT::v1i64, MVT::v2i64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
@@ -1513,7 +1514,9 @@ void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
setOperationAction(ISD::FSUB, VT, Custom);
setOperationAction(ISD::FTRUNC, VT, Custom);
setOperationAction(ISD::LOAD, VT, Custom);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::MULHS, VT, Custom);
@@ -2228,6 +2231,13 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
// Lowering Code
//===----------------------------------------------------------------------===//
// Forward declarations of SVE fixed length lowering helpers
static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT);
static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V);
static SDValue convertFixedMaskToScalableVector(SDValue Mask,
SelectionDAG &DAG);
/// isZerosVector - Check whether SDNode N is a zero-filled vector.
static bool isZerosVector(const SDNode *N) {
// Look through a bit convert.
@@ -4232,6 +4242,12 @@ void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT,
if (!isNullConstant(BasePtr))
return;
// FIXME: This will not match for fixed vector type codegen as the nodes in
// question will have fixed<->scalable conversions around them. This should be
moved to a DAG combine or complex pattern so that it executes after all of
the fixed vector inserts and extracts have been removed. This deficiency
// will result in a sub-optimal addressing mode being used, i.e. an ADD not
// being folded into the scatter/gather.
ConstantSDNode *Offset = nullptr;
if (Index.getOpcode() == ISD::ADD)
if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) {
@@ -4276,6 +4292,8 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
MaskedGatherSDNode *MGT = cast<MaskedGatherSDNode>(Op);
assert(MGT && "Can only custom lower gather load nodes");
bool IsFixedLength = MGT->getMemoryVT().isFixedLengthVector();
SDValue Index = MGT->getIndex();
SDValue Chain = MGT->getChain();
SDValue PassThru = MGT->getPassThru();
@@ -4294,6 +4312,7 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
bool ResNeedsSignExtend = ExtTy == ISD::EXTLOAD || ExtTy == ISD::SEXTLOAD;
EVT VT = PassThru.getSimpleValueType();
EVT IndexVT = Index.getSimpleValueType();
EVT MemVT = MGT->getMemoryVT();
SDValue InputVT = DAG.getValueType(MemVT);
@@ -4301,14 +4320,27 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
return SDValue();
// Handle FP data by using an integer gather and casting the result.
if (VT.isFloatingPoint()) {
EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount());
PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG);
if (IsFixedLength) {
assert(Subtarget->useSVEForFixedLengthVectors() &&
"Cannot lower when not using SVE for fixed vectors");
IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
}
if (PassThru->isUndef() || isZerosVector(PassThru.getNode()))
PassThru = SDValue();
if (VT.isFloatingPoint() && !IsFixedLength) {
// Handle FP data by using an integer gather and casting the result.
if (PassThru) {
EVT PassThruVT = getPackedSVEVectorVT(VT.getVectorElementCount());
PassThru = getSVESafeBitCast(PassThruVT, PassThru, DAG);
}
InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
}
SDVTList VTs = DAG.getVTList(PassThru.getSimpleValueType(), MVT::Other);
SDVTList VTs = DAG.getVTList(IndexVT, MVT::Other);
if (getGatherScatterIndexIsExtended(Index))
Index = Index.getOperand(0);
@@ -4320,15 +4352,36 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
if (ResNeedsSignExtend)
Opcode = getSignExtendedGatherOpcode(Opcode);
SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT, PassThru};
SDValue Gather = DAG.getNode(Opcode, DL, VTs, Ops);
if (VT.isFloatingPoint()) {
SDValue Cast = getSVESafeBitCast(VT, Gather, DAG);
return DAG.getMergeValues({Cast, Gather.getValue(1)}, DL);
if (IsFixedLength) {
if (Index.getSimpleValueType().isFixedLengthVector())
Index = convertToScalableVector(DAG, IndexVT, Index);
if (BasePtr.getSimpleValueType().isFixedLengthVector())
BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr);
Mask = convertFixedMaskToScalableVector(Mask, DAG);
}
return Gather;
SDValue Ops[] = {Chain, Mask, BasePtr, Index, InputVT};
SDValue Result = DAG.getNode(Opcode, DL, VTs, Ops);
Chain = Result.getValue(1);
if (IsFixedLength) {
Result = convertFromScalableVector(
DAG, VT.changeVectorElementType(IndexVT.getVectorElementType()),
Result);
Result = DAG.getNode(ISD::TRUNCATE, DL, VT.changeTypeToInteger(), Result);
Result = DAG.getNode(ISD::BITCAST, DL, VT, Result);
if (PassThru)
Result = DAG.getSelect(DL, VT, MGT->getMask(), Result, PassThru);
} else {
if (PassThru)
Result = DAG.getSelect(DL, IndexVT, Mask, Result, PassThru);
if (VT.isFloatingPoint())
Result = getSVESafeBitCast(VT, Result, DAG);
}
return DAG.getMergeValues({Result, Chain}, DL);
}
SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
@@ -4337,6 +4390,8 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(Op);
assert(MSC && "Can only custom lower scatter store nodes");
bool IsFixedLength = MSC->getMemoryVT().isFixedLengthVector();
SDValue Index = MSC->getIndex();
SDValue Chain = MSC->getChain();
SDValue StoreVal = MSC->getValue();
@@ -4353,6 +4408,7 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
Index.getSimpleValueType().getVectorElementType() == MVT::i32;
EVT VT = StoreVal.getSimpleValueType();
EVT IndexVT = Index.getSimpleValueType();
SDVTList VTs = DAG.getVTList(MVT::Other);
EVT MemVT = MSC->getMemoryVT();
SDValue InputVT = DAG.getValueType(MemVT);
@@ -4361,8 +4417,21 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
return SDValue();
// Handle FP data by casting the data so an integer scatter can be used.
if (VT.isFloatingPoint()) {
if (IsFixedLength) {
assert(Subtarget->useSVEForFixedLengthVectors() &&
"Cannot lower when not using SVE for fixed vectors");
IndexVT = getContainerForFixedLengthVector(DAG, IndexVT);
MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType());
InputVT = DAG.getValueType(MemVT.changeTypeToInteger());
StoreVal =
DAG.getNode(ISD::BITCAST, DL, VT.changeTypeToInteger(), StoreVal);
StoreVal = DAG.getNode(
ISD::ANY_EXTEND, DL,
VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal);
StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal);
} else if (VT.isFloatingPoint()) {
// Handle FP data by casting the data so an integer scatter can be used.
EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount());
StoreVal = getSVESafeBitCast(StoreValVT, StoreVal, DAG);
InputVT = DAG.getValueType(MemVT.changeVectorElementTypeToInteger());
@@ -4375,6 +4444,14 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
/*isGather=*/false, DAG);
if (IsFixedLength) {
if (Index.getSimpleValueType().isFixedLengthVector())
Index = convertToScalableVector(DAG, IndexVT, Index);
if (BasePtr.getSimpleValueType().isFixedLengthVector())
BasePtr = convertToScalableVector(DAG, IndexVT, BasePtr);
Mask = convertFixedMaskToScalableVector(Mask, DAG);
}
SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
return DAG.getNode(Opcode, DL, VTs, Ops);
}


@@ -243,7 +243,13 @@ public:
}
bool isLegalMaskedGatherScatter(Type *DataType) const {
if (isa<FixedVectorType>(DataType) || !ST->hasSVE())
if (!ST->hasSVE())
return false;
// For fixed vectors, scalarize if not using SVE for them.
auto *DataTypeFVTy = dyn_cast<FixedVectorType>(DataType);
if (DataTypeFVTy && (!ST->useSVEForFixedLengthVectors() ||
DataTypeFVTy->getNumElements() < 2))
return false;
return isLegalElementTypeForSVE(DataType->getScalarType());
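
As an illustrative (and hypothetical) consequence of this cost-model change:
when SVE is used for fixed-length vectors and the element type is SVE-legal,
a scatter such as the one below is now reported as legal, so passes that query
this hook (such as the loop vectorizer) can emit it and leave it to the
lowering above instead of scalarizing:

; Hypothetical IR: a fixed-length masked scatter the hook now accepts.
define void @scatter_v4f32(<4 x float> %vals, <4 x float*> %ptrs, <4 x i1> %mask) #0 {
  call void @llvm.masked.scatter.v4f32(<4 x float> %vals, <4 x float*> %ptrs, i32 4, <4 x i1> %mask)
  ret void
}
declare void @llvm.masked.scatter.v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
attributes #0 = { "target-features"="+sve" }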

File diff suppressed because it is too large.


@@ -0,0 +1,999 @@
; RUN: llc -aarch64-sve-vector-bits-min=128 -asm-verbose=0 < %s | FileCheck %s -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK
; RUN: llc -aarch64-sve-vector-bits-min=512 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=640 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=768 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=896 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1024 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1152 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1280 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1408 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1536 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1664 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1792 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=1920 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 -asm-verbose=0 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512
target triple = "aarch64-unknown-linux-gnu"
; Don't use SVE when its registers are no bigger than NEON.
; NO_SVE-NOT: ptrue
;
; ST1B
;
define void @masked_scatter_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v2i8:
; CHECK: ldrb [[VALS_LO:w[0-9]+]], [x0]
; CHECK-NEXT: ldrb [[VALS_HI:w[0-9]+]], [x0, #1]
; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2
; CHECK-NEXT: fmov s[[VALS:[0-9]+]], [[VALS_LO]]
; CHECK-NEXT: mov v[[VALS]].s[1], [[VALS_HI]]
; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[VALS]].2s, #0
; CHECK-NEXT: st1b { z[[SHL]].d }, [[MASK]], [z[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <2 x i8>, <2 x i8>* %a
%ptrs = load <2 x i8*>, <2 x i8*>* %b
%mask = icmp eq <2 x i8> %vals, zeroinitializer
call void @llvm.masked.scatter.v2i8(<2 x i8> %vals, <2 x i8*> %ptrs, i32 8, <2 x i1> %mask)
ret void
}
define void @masked_scatter_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v4i8:
; CHECK: ldr s[[VALS:[0-9]+]], [x0]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4
; CHECK-NEXT: ushll [[SHL:v[0-9]+]].8h, v[[VALS]].8b, #0
; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].4h, [[SHL]].4h, #0
; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0
; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
; CHECK-NEXT: st1b { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <4 x i8>, <4 x i8>* %a
%ptrs = load <4 x i8*>, <4 x i8*>* %b
%mask = icmp eq <4 x i8> %vals, zeroinitializer
call void @llvm.masked.scatter.v4i8(<4 x i8> %vals, <4 x i8*> %ptrs, i32 8, <4 x i1> %mask)
ret void
}
define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v8i8:
; VBITS_EQ_256: ldr d[[VALS:[0-9]+]], [x0]
; VBITS_EQ_256-NEXT: add x8, x1, #32
; VBITS_EQ_256-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4
; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4
; VBITS_EQ_256-NEXT: cmeq [[ZMSK:v[0-9]+]].8b, v[[VALS]].8b, #0
; VBITS_EQ_256-NEXT: zip1 [[VAL_LO:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b
; VBITS_EQ_256-NEXT: zip2 [[VAL_HI:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b
; VBITS_EQ_256-NEXT: shl [[SHL_LO:v[0-9]+]].4h, [[VAL_LO]].4h, #8
; VBITS_EQ_256-NEXT: shl [[SHL_HI:v[0-9]+]].4h, [[VAL_HI]].4h, #8
; VBITS_EQ_256-NEXT: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8]
; VBITS_EQ_256-NEXT: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_EQ_256-NEXT: sshr v[[SSHR_LO:[0-9]+]].4h, [[SHL_LO]].4h, #8
; VBITS_EQ_256-NEXT: sshr v[[SSHR_HI:[0-9]+]].4h, [[SHL_HI]].4h, #8
; VBITS_EQ_256-NEXT: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_LO]].h, #0
; VBITS_EQ_256-NEXT: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_HI]].h, #0
; VBITS_EQ_256-NEXT: zip1 v[[VALS2_LO:[0-9]+]].8b, v[[VALS]].8b, v[[VALS]].8b
; VBITS_EQ_256-NEXT: zip2 v[[VALS2_HI:[0-9]+]].8b, v[[VALS]].8b, v[[VALS]].8b
; VBITS_EQ_256-NEXT: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[VALS2_LO]].h
; VBITS_EQ_256-NEXT: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[VALS2_HI]].h
; VBITS_EQ_256-NEXT: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s
; VBITS_EQ_256-NEXT: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s
; VBITS_EQ_256-NEXT: st1b { [[UPK2_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d]
; VBITS_EQ_256-NEXT: st1b { [[UPK2_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_512: ldr d[[VALS:[0-9]+]], [x0]
; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].b, vl8
; VBITS_GE_512-NEXT: cmeq v[[CMP:[0-9]+]].8b, v[[VALS]].8b, #0
; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].h, z[[VALS]].b
; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].s, [[UPK1]].h
; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].b, [[PG1]]/z, z[[CMP]].b, #0
; VBITS_GE_512-NEXT: uunpklo [[UPK3:z[0-9]+]].d, [[UPK2]].s
; VBITS_GE_512-NEXT: st1b { [[UPK3]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_512-NEXT: ret
%vals = load <8 x i8>, <8 x i8>* %a
%ptrs = load <8 x i8*>, <8 x i8*>* %b
%mask = icmp eq <8 x i8> %vals, zeroinitializer
call void @llvm.masked.scatter.v8i8(<8 x i8> %vals, <8 x i8*> %ptrs, i32 8, <8 x i1> %mask)
ret void
}
define void @masked_scatter_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v16i8:
; VBITS_GE_1024: ldr q[[VALS:[0-9]+]], [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG0:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].b, vl16
; VBITS_GE_1024-NEXT: cmeq v[[CMP:[0-9]+]].16b, v[[VALS]].16b, #0
; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].h, z[[VALS]].b
; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].s, [[UPK1]].h
; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].b, [[PG1]]/z, z[[CMP]].b, #0
; VBITS_GE_1024-NEXT: uunpklo [[UPK3:z[0-9]+]].d, [[UPK2]].s
; VBITS_GE_1024-NEXT: st1b { [[UPK3]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_1024-NEXT: ret
%vals = load <16 x i8>, <16 x i8>* %a
%ptrs = load <16 x i8*>, <16 x i8*>* %b
%mask = icmp eq <16 x i8> %vals, zeroinitializer
call void @llvm.masked.scatter.v16i8(<16 x i8> %vals, <16 x i8*> %ptrs, i32 8, <16 x i1> %mask)
ret void
}
define void @masked_scatter_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v32i8:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].b, vl32
; VBITS_GE_2048-NEXT: ld1b { [[VALS:z[0-9]+]].b }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: cmpeq [[MASK:p[0-9]+]].b, [[PG0]]/z, [[VALS]].b, #0
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].h, [[VALS]].b
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].s, [[UPK1]].h
; VBITS_GE_2048-NEXT: uunpklo [[UPK3:z[0-9]+]].d, [[UPK2]].s
; VBITS_GE_2048-NEXT: st1b { [[UPK3]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x i8>, <32 x i8>* %a
%ptrs = load <32 x i8*>, <32 x i8*>* %b
%mask = icmp eq <32 x i8> %vals, zeroinitializer
call void @llvm.masked.scatter.v32i8(<32 x i8> %vals, <32 x i8*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
;
; ST1H
;
define void @masked_scatter_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v2i16:
; CHECK: ldrh [[VALS_LO:w[0-9]+]], [x0]
; CHECK-NEXT: ldrh [[VALS_HI:w[0-9]+]], [x0, #2]
; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2
; CHECK-NEXT: fmov s[[VALS:[0-9]+]], [[VALS_LO]]
; CHECK-NEXT: mov v[[VALS]].s[1], [[VALS_HI]]
; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[VALS]].2s, #0
; CHECK-NEXT: st1h { z[[SHL]].d }, [[MASK]], [z[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <2 x i16>, <2 x i16>* %a
%ptrs = load <2 x i16*>, <2 x i16*>* %b
%mask = icmp eq <2 x i16> %vals, zeroinitializer
call void @llvm.masked.scatter.v2i16(<2 x i16> %vals, <2 x i16*> %ptrs, i32 8, <2 x i1> %mask)
ret void
}
define void @masked_scatter_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v4i16:
; CHECK: ldr d[[VALS:[0-9]+]], [x0]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4
; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].4h, v[[VALS]].4h, #0
; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0
; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
; CHECK-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <4 x i16>, <4 x i16>* %a
%ptrs = load <4 x i16*>, <4 x i16*>* %b
%mask = icmp eq <4 x i16> %vals, zeroinitializer
call void @llvm.masked.scatter.v4i16(<4 x i16> %vals, <4 x i16*> %ptrs, i32 8, <4 x i1> %mask)
ret void
}
define void @masked_scatter_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v8i16:
; VBITS_EQ_256: ldr q[[VALS:[0-9]+]], [x0]
; VBITS_EQ_256-NEXT: add x8, x1, #32
; VBITS_EQ_256-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4
; VBITS_EQ_256-NEXT: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8]
; VBITS_EQ_256-NEXT: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4
; VBITS_EQ_256-NEXT: cmeq v[[ZMSK:[0-9]+]].8h, v[[VALS]].8h, #0
; VBITS_EQ_256-DAG: ext v[[EXT:[0-9]+]].16b, v[[VALS]].16b, v[[VALS]].16b, #8
; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[ZMSK]].h, #0
; VBITS_EQ_256-DAG: ext v[[ZEXT:[0-9]+]].16b, v[[ZMSK]].16b, v[[ZMSK]].16b, #8
; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[ZEXT]].h, #0
; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[VALS]].h
; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[EXT]].h
; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s
; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s
; VBITS_EQ_256-DAG: st1h { [[UPK2_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d]
; VBITS_EQ_256-DAG: st1h { [[UPK2_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_512: ldr q[[VALS:[0-9]+]], [x0]
; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8
; VBITS_GE_512-NEXT: cmeq v[[CMP:[0-9]+]].8h, v[[VALS]].8h, #0
; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h
; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0
; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
; VBITS_GE_512-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_512-NEXT: ret
%vals = load <8 x i16>, <8 x i16>* %a
%ptrs = load <8 x i16*>, <8 x i16*>* %b
%mask = icmp eq <8 x i16> %vals, zeroinitializer
call void @llvm.masked.scatter.v8i16(<8 x i16> %vals, <8 x i16*> %ptrs, i32 8, <8 x i1> %mask)
ret void
}
define void @masked_scatter_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v16i16:
; VBITS_GE_1024: ptrue [[PG0:p[0-9]+]].h, vl16
; VBITS_GE_1024-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_1024-NEXT: cmpeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0
; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h
; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
; VBITS_GE_1024-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_1024-NEXT: ret
%vals = load <16 x i16>, <16 x i16>* %a
%ptrs = load <16 x i16*>, <16 x i16*>* %b
%mask = icmp eq <16 x i16> %vals, zeroinitializer
call void @llvm.masked.scatter.v16i16(<16 x i16> %vals, <16 x i16*> %ptrs, i32 8, <16 x i1> %mask)
ret void
}
define void @masked_scatter_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v32i16:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: cmpeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
; VBITS_GE_2048-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x i16>, <32 x i16>* %a
%ptrs = load <32 x i16*>, <32 x i16*>* %b
%mask = icmp eq <32 x i16> %vals, zeroinitializer
call void @llvm.masked.scatter.v32i16(<32 x i16> %vals, <32 x i16*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
;
; ST1W
;
define void @masked_scatter_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v2i32:
; CHECK: ldr d[[VALS:[0-9]+]], [x0]
; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2
; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[VALS]].2s, #0
; CHECK-NEXT: st1w { z[[SHL]].d }, [[MASK]], [z[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <2 x i32>, <2 x i32>* %a
%ptrs = load <2 x i32*>, <2 x i32*>* %b
%mask = icmp eq <2 x i32> %vals, zeroinitializer
call void @llvm.masked.scatter.v2i32(<2 x i32> %vals, <2 x i32*> %ptrs, i32 8, <2 x i1> %mask)
ret void
}
define void @masked_scatter_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v4i32:
; CHECK: ldr q[[VALS:[0-9]+]], [x0]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].4s, v[[VALS]].4s, #0
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, z[[CMP]].s, #0
; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[VALS]].s
; CHECK-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <4 x i32>, <4 x i32>* %a
%ptrs = load <4 x i32*>, <4 x i32*>* %b
%mask = icmp eq <4 x i32> %vals, zeroinitializer
call void @llvm.masked.scatter.v4i32(<4 x i32> %vals, <4 x i32*> %ptrs, i32 8, <4 x i1> %mask)
ret void
}
define void @masked_scatter_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v8i32:
; VBITS_EQ_256: ptrue [[PG0:p[0-9]+]].s, vl8
; VBITS_EQ_256-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_EQ_256-NEXT: add x8, x1, #32
; VBITS_EQ_256-NEXT: ptrue [[PG1:p[0-9]+]].d, vl4
; VBITS_EQ_256-NEXT: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG1]]/z, [x8]
; VBITS_EQ_256-NEXT: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_EQ_256-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0
; VBITS_EQ_256-NEXT: add x8, sp, #32
; VBITS_EQ_256-NEXT: mov x9, sp
; VBITS_EQ_256-NEXT: mov [[MONE:z[0-9]+]].s, p1/z, #-1
; VBITS_EQ_256-NEXT: st1w { [[MONE]].s }, [[PG0]], [x8]
; VBITS_EQ_256-NEXT: st1w { [[VALS]].s }, [[PG0]], [x9]
; VBITS_EQ_256-NEXT: ldr q[[CMP_LO:[0-9]+]], [sp, #32]
; VBITS_EQ_256-NEXT: ldr q[[VAL_LO:[0-9]+]], [sp]
; VBITS_EQ_256-NEXT: ptrue [[PG2:p[0-9]+]].s, vl4
; VBITS_EQ_256-NEXT: cmpne [[MASK_LO:p[0-9]+]].s, [[PG2]]/z, z[[CMP_LO]].s, #0
; VBITS_EQ_256-NEXT: uunpklo [[UPK1_LO:z[0-9]+]].d, z[[VAL_LO]].s
; VBITS_EQ_256-NEXT: st1w { [[UPK1_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d]
; VBITS_EQ_256-NEXT: ldr q[[CMP_HI:[0-9]+]], [sp, #48]
; VBITS_EQ_256-NEXT: ldr q[[VAL_HI:[0-9]+]], [sp, #16]
; VBITS_EQ_256-NEXT: cmpne [[MASK_HI:p[0-9]+]].s, [[PG2]]/z, z[[CMP_HI]].s, #0
; VBITS_EQ_256-NEXT: uunpklo [[UPK1_HI:z[0-9]+]].d, z[[VAL_HI]].s
; VBITS_EQ_256-NEXT: st1w { [[UPK1_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d]
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].s, vl8
; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0
; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
; VBITS_GE_512-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_512-NEXT: ret
%vals = load <8 x i32>, <8 x i32>* %a
%ptrs = load <8 x i32*>, <8 x i32*>* %b
%mask = icmp eq <8 x i32> %vals, zeroinitializer
call void @llvm.masked.scatter.v8i32(<8 x i32> %vals, <8 x i32*> %ptrs, i32 8, <8 x i1> %mask)
ret void
}
define void @masked_scatter_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v16i32:
; VBITS_GE_1024: ptrue [[PG0:p[0-9]+]].s, vl16
; VBITS_GE_1024-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_1024-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0
; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
; VBITS_GE_1024-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_1024-NEXT: ret
%vals = load <16 x i32>, <16 x i32>* %a
%ptrs = load <16 x i32*>, <16 x i32*>* %b
%mask = icmp eq <16 x i32> %vals, zeroinitializer
call void @llvm.masked.scatter.v16i32(<16 x i32> %vals, <16 x i32*> %ptrs, i32 8, <16 x i1> %mask)
ret void
}
define void @masked_scatter_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v32i32:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
; VBITS_GE_2048-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x i32>, <32 x i32>* %a
%ptrs = load <32 x i32*>, <32 x i32*>* %b
%mask = icmp eq <32 x i32> %vals, zeroinitializer
call void @llvm.masked.scatter.v32i32(<32 x i32> %vals, <32 x i32*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
;
; ST1D
;
; Scalarize 1 x i64 scatters
define void @masked_scatter_v1i64(<1 x i64>* %a, <1 x i64*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v1i64:
; CHECK-NOT: ptrue
%vals = load <1 x i64>, <1 x i64>* %a
%ptrs = load <1 x i64*>, <1 x i64*>* %b
%mask = icmp eq <1 x i64> %vals, zeroinitializer
call void @llvm.masked.scatter.v1i64(<1 x i64> %vals, <1 x i64*> %ptrs, i32 8, <1 x i1> %mask)
ret void
}
define void @masked_scatter_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v2i64:
; CHECK: ldr q[[VALS:[0-9]+]], [x0]
; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2
; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].2d, v[[VALS]].2d, #0
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[CMP]].d, #0
; CHECK-NEXT: st1d { z[[VALS]].d }, [[MASK]], [z[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <2 x i64>, <2 x i64>* %a
%ptrs = load <2 x i64*>, <2 x i64*>* %b
%mask = icmp eq <2 x i64> %vals, zeroinitializer
call void @llvm.masked.scatter.v2i64(<2 x i64> %vals, <2 x i64*> %ptrs, i32 8, <2 x i1> %mask)
ret void
}
define void @masked_scatter_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v4i64:
; CHECK: ptrue [[PG0:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; CHECK-NEXT: cmpeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0
; CHECK-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <4 x i64>, <4 x i64>* %a
%ptrs = load <4 x i64*>, <4 x i64*>* %b
%mask = icmp eq <4 x i64> %vals, zeroinitializer
call void @llvm.masked.scatter.v4i64(<4 x i64> %vals, <4 x i64*> %ptrs, i32 8, <4 x i1> %mask)
ret void
}
define void @masked_scatter_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v8i64:
; VBITS_EQ_256: ptrue [[PG0:p[0-9]+]].d, vl4
; VBITS_EQ_256-NEXT: add x8, x0, #32
; VBITS_EQ_256-NEXT: add x9, x1, #32
; VBITS_EQ_256-NEXT: ld1d { [[VALS_LO:z[0-9]+]].d }, [[PG0]]/z, [x0]
; VBITS_EQ_256-NEXT: ld1d { [[VALS_HI:z[0-9]+]].d }, [[PG0]]/z, [x8]
; VBITS_EQ_256-NEXT: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x9]
; VBITS_EQ_256-NEXT: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_EQ_256-NEXT: cmpeq [[MASK_HI:p[0-9]+]].d, [[PG0]]/z, [[VALS_HI]].d, #0
; VBITS_EQ_256-NEXT: cmpeq [[MASK_LO:p[0-9]+]].d, [[PG0]]/z, [[VALS_LO]].d, #0
; VBITS_EQ_256-NEXT: st1d { [[VALS_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d]
; VBITS_EQ_256-NEXT: st1d { [[VALS_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d]
; VBITS_EQ_256-NEXT: ret
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: cmpeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0
; VBITS_GE_512-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_512-NEXT: ret
%vals = load <8 x i64>, <8 x i64>* %a
%ptrs = load <8 x i64*>, <8 x i64*>* %b
%mask = icmp eq <8 x i64> %vals, zeroinitializer
call void @llvm.masked.scatter.v8i64(<8 x i64> %vals, <8 x i64*> %ptrs, i32 8, <8 x i1> %mask)
ret void
}
define void @masked_scatter_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v16i64:
; VBITS_GE_1024: ptrue [[PG0:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_GE_1024-NEXT: cmpeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0
; VBITS_GE_1024-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_1024-NEXT: ret
%vals = load <16 x i64>, <16 x i64>* %a
%ptrs = load <16 x i64*>, <16 x i64*>* %b
%mask = icmp eq <16 x i64> %vals, zeroinitializer
call void @llvm.masked.scatter.v16i64(<16 x i64> %vals, <16 x i64*> %ptrs, i32 8, <16 x i1> %mask)
ret void
}
define void @masked_scatter_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v32i64:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_GE_2048-NEXT: cmpeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, #0
; VBITS_GE_2048-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x i64>, <32 x i64>* %a
%ptrs = load <32 x i64*>, <32 x i64*>* %b
%mask = icmp eq <32 x i64> %vals, zeroinitializer
call void @llvm.masked.scatter.v32i64(<32 x i64> %vals, <32 x i64*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
;
; ST1H (float)
;
define void @masked_scatter_v2f16(<2 x half>* %a, <2 x half*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v2f16:
; CHECK: ldr s[[VALS:[0-9]+]], [x0]
; CHECK-NEXT: movi d2, #0000000000000000
; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].h, vl4
; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].4h, v[[VALS]].4h, #0.0
; CHECK-NEXT: umov w8, v[[CMP]].h[0]
; CHECK-NEXT: umov w9, v[[CMP]].h[1]
; CHECK-NEXT: fmov s[[CMP]], w8
; CHECK-NEXT: mov v[[CMP]].s[1], w9
; CHECK-NEXT: shl v[[CMP]].2s, v[[CMP]].2s, #16
; CHECK-NEXT: sshr v[[CMP]].2s, v[[CMP]].2s, #16
; CHECK-NEXT: fmov w9, s[[CMP]]
; CHECK-NEXT: mov w8, v[[CMP]].s[1]
; CHECK-NEXT: mov v[[NCMP:[0-9]+]].h[0], w9
; CHECK-NEXT: mov v[[NCMP]].h[1], w8
; CHECK-NEXT: shl v[[NCMP]].4h, v[[NCMP]].4h, #15
; CHECK-NEXT: uunpklo [[UPK1]].s, z[[VALS]].h
; CHECK-NEXT: sshr v[[NCMP]].4h, v[[NCMP]].4h, #15
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG0]]/z, z[[NCMP]].h, #0
; CHECK-NEXT: uunpklo [[UPK2]].d, [[UPK1]].s
; CHECK-NEXT: st1h { [[UPK2]].d }, [[MASK]], [z[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <2 x half>, <2 x half>* %a
%ptrs = load <2 x half*>, <2 x half*>* %b
%mask = fcmp oeq <2 x half> %vals, zeroinitializer
call void @llvm.masked.scatter.v2f16(<2 x half> %vals, <2 x half*> %ptrs, i32 8, <2 x i1> %mask)
ret void
}
define void @masked_scatter_v4f16(<4 x half>* %a, <4 x half*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v4f16:
; CHECK: ldr d[[VALS:[0-9]+]], [x0]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4
; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].4h, v[[VALS]].4h, #0
; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0
; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
; CHECK-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <4 x half>, <4 x half>* %a
%ptrs = load <4 x half*>, <4 x half*>* %b
%mask = fcmp oeq <4 x half> %vals, zeroinitializer
call void @llvm.masked.scatter.v4f16(<4 x half> %vals, <4 x half*> %ptrs, i32 8, <4 x i1> %mask)
ret void
}
define void @masked_scatter_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v8f16:
; VBITS_GE_512: ldr q[[VALS:[0-9]+]], [x0]
; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8
; VBITS_GE_512-NEXT: fcmeq v[[CMP:[0-9]+]].8h, v[[VALS]].8h, #0
; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h
; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0
; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
; VBITS_GE_512-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_512-NEXT: ret
%vals = load <8 x half>, <8 x half>* %a
%ptrs = load <8 x half*>, <8 x half*>* %b
%mask = fcmp oeq <8 x half> %vals, zeroinitializer
call void @llvm.masked.scatter.v8f16(<8 x half> %vals, <8 x half*> %ptrs, i32 8, <8 x i1> %mask)
ret void
}
define void @masked_scatter_v16f16(<16 x half>* %a, <16 x half*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v16f16:
; VBITS_GE_1024: ptrue [[PG0:p[0-9]+]].h, vl16
; VBITS_GE_1024-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].h, #0
; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]]
; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h
; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
; VBITS_GE_1024-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_1024-NEXT: ret
%vals = load <16 x half>, <16 x half>* %a
%ptrs = load <16 x half*>, <16 x half*>* %b
%mask = fcmp oeq <16 x half> %vals, zeroinitializer
call void @llvm.masked.scatter.v16f16(<16 x half> %vals, <16 x half*> %ptrs, i32 8, <16 x i1> %mask)
ret void
}
define void @masked_scatter_v32f16(<32 x half>* %a, <32 x half*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v32f16:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]]
; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h
; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
; VBITS_GE_2048-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x half>, <32 x half>* %a
%ptrs = load <32 x half*>, <32 x half*>* %b
%mask = fcmp oeq <32 x half> %vals, zeroinitializer
call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x half*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
;
; ST1W (float)
;
define void @masked_scatter_v2f32(<2 x float>* %a, <2 x float*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v2f32:
; CHECK: ldr d[[VALS:[0-9]+]], [x0]
; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2
; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[VALS]].2s, #0
; CHECK-NEXT: st1w { z[[SHL]].d }, [[MASK]], [z[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <2 x float>, <2 x float>* %a
%ptrs = load <2 x float*>, <2 x float*>* %b
%mask = fcmp oeq <2 x float> %vals, zeroinitializer
call void @llvm.masked.scatter.v2f32(<2 x float> %vals, <2 x float*> %ptrs, i32 8, <2 x i1> %mask)
ret void
}
define void @masked_scatter_v4f32(<4 x float>* %a, <4 x float*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v4f32:
; CHECK: ldr q[[VALS:[0-9]+]], [x0]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].4s, v[[VALS]].4s, #0
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, z[[CMP]].s, #0
; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[VALS]].s
; CHECK-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <4 x float>, <4 x float>* %a
%ptrs = load <4 x float*>, <4 x float*>* %b
%mask = fcmp oeq <4 x float> %vals, zeroinitializer
call void @llvm.masked.scatter.v4f32(<4 x float> %vals, <4 x float*> %ptrs, i32 8, <4 x i1> %mask)
ret void
}
define void @masked_scatter_v8f32(<8 x float>* %a, <8 x float*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v8f32:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].s, vl8
; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_512-NEXT: mov [[ZERO:z[0-9]+]].s, #0
; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]]
; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
; VBITS_GE_512-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_512-NEXT: ret
%vals = load <8 x float>, <8 x float>* %a
%ptrs = load <8 x float*>, <8 x float*>* %b
%mask = fcmp oeq <8 x float> %vals, zeroinitializer
call void @llvm.masked.scatter.v8f32(<8 x float> %vals, <8 x float*> %ptrs, i32 8, <8 x i1> %mask)
ret void
}
define void @masked_scatter_v16f32(<16 x float>* %a, <16 x float*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v16f32:
; VBITS_GE_1024: ptrue [[PG0:p[0-9]+]].s, vl16
; VBITS_GE_1024-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].s, #0
; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]]
; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
; VBITS_GE_1024-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_1024-NEXT: ret
%vals = load <16 x float>, <16 x float>* %a
%ptrs = load <16 x float*>, <16 x float*>* %b
%mask = fcmp oeq <16 x float> %vals, zeroinitializer
call void @llvm.masked.scatter.v16f32(<16 x float> %vals, <16 x float*> %ptrs, i32 8, <16 x i1> %mask)
ret void
}
define void @masked_scatter_v32f32(<32 x float>* %a, <32 x float*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v32f32:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]]
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
; VBITS_GE_2048-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x float>, <32 x float>* %a
%ptrs = load <32 x float*>, <32 x float*>* %b
%mask = fcmp oeq <32 x float> %vals, zeroinitializer
call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x float*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
;
; ST1D (float)
;
; Scalarize 1 x double scatters
define void @masked_scatter_v1f64(<1 x double>* %a, <1 x double*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v1f64:
; CHECK-NOT: ptrue
%vals = load <1 x double>, <1 x double>* %a
%ptrs = load <1 x double*>, <1 x double*>* %b
%mask = fcmp oeq <1 x double> %vals, zeroinitializer
call void @llvm.masked.scatter.v1f64(<1 x double> %vals, <1 x double*> %ptrs, i32 8, <1 x i1> %mask)
ret void
}
define void @masked_scatter_v2f64(<2 x double>* %a, <2 x double*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v2f64:
; CHECK: ldr q[[VALS:[0-9]+]], [x0]
; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1]
; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2
; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].2d, v[[VALS]].2d, #0
; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[CMP]].d, #0
; CHECK-NEXT: st1d { z[[VALS]].d }, [[MASK]], [z[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <2 x double>, <2 x double>* %a
%ptrs = load <2 x double*>, <2 x double*>* %b
%mask = fcmp oeq <2 x double> %vals, zeroinitializer
call void @llvm.masked.scatter.v2f64(<2 x double> %vals, <2 x double*> %ptrs, i32 8, <2 x i1> %mask)
ret void
}
define void @masked_scatter_v4f64(<4 x double>* %a, <4 x double*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v4f64:
; CHECK: ptrue [[PG0:p[0-9]+]].d, vl4
; CHECK-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; CHECK-NEXT: mov [[ZERO:z[0-9]+]].d, #0
; CHECK-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]].d
; CHECK-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; CHECK-NEXT: ret
%vals = load <4 x double>, <4 x double>* %a
%ptrs = load <4 x double*>, <4 x double*>* %b
%mask = fcmp oeq <4 x double> %vals, zeroinitializer
call void @llvm.masked.scatter.v4f64(<4 x double> %vals, <4 x double*> %ptrs, i32 8, <4 x i1> %mask)
ret void
}
define void @masked_scatter_v8f64(<8 x double>* %a, <8 x double*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v8f64:
; VBITS_GE_512: ptrue [[PG0:p[0-9]+]].d, vl8
; VBITS_GE_512-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_GE_512-NEXT: mov [[ZERO:z[0-9]+]].d, #0
; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]]
; VBITS_GE_512-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_512-NEXT: ret
%vals = load <8 x double>, <8 x double>* %a
%ptrs = load <8 x double*>, <8 x double*>* %b
%mask = fcmp oeq <8 x double> %vals, zeroinitializer
call void @llvm.masked.scatter.v8f64(<8 x double> %vals, <8 x double*> %ptrs, i32 8, <8 x i1> %mask)
ret void
}
define void @masked_scatter_v16f64(<16 x double>* %a, <16 x double*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v16f64:
; VBITS_GE_1024: ptrue [[PG0:p[0-9]+]].d, vl16
; VBITS_GE_1024-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_GE_1024-NEXT: mov [[ZERO:z[0-9]+]].d, #0
; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]]
; VBITS_GE_1024-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_1024-NEXT: ret
%vals = load <16 x double>, <16 x double>* %a
%ptrs = load <16 x double*>, <16 x double*>* %b
%mask = fcmp oeq <16 x double> %vals, zeroinitializer
call void @llvm.masked.scatter.v16f64(<16 x double> %vals, <16 x double*> %ptrs, i32 8, <16 x i1> %mask)
ret void
}
define void @masked_scatter_v32f64(<32 x double>* %a, <32 x double*>* %b) #0 {
; CHECK-LABEL: masked_scatter_v32f64:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[VALS:z[0-9]+]].d }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].d, #0
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].d, [[PG0]]/z, [[VALS]].d, [[ZERO]]
; VBITS_GE_2048-NEXT: st1d { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS]].d]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x double>, <32 x double>* %a
%ptrs = load <32 x double*>, <32 x double*>* %b
%mask = fcmp oeq <32 x double> %vals, zeroinitializer
call void @llvm.masked.scatter.v32f64(<32 x double> %vals, <32 x double*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
; The tests above cover the supported types; the tests below check that the
; addressing modes still function.
define void @masked_scatter_32b_scaled_sext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
; CHECK-LABEL: masked_scatter_32b_scaled_sext:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]]
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h
; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw #1]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x half>, <32 x half>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = sext <32 x i32> %idxs to <32 x i64>
%ptrs = getelementptr half, half* %base, <32 x i64> %ext
%mask = fcmp oeq <32 x half> %vals, zeroinitializer
call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x half*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
define void @masked_scatter_32b_scaled_zext(<32 x half>* %a, <32 x i32>* %b, half* %base) #0 {
; CHECK-LABEL: masked_scatter_32b_scaled_zext:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]]
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h
; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw #1]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x half>, <32 x half>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = zext <32 x i32> %idxs to <32 x i64>
%ptrs = getelementptr half, half* %base, <32 x i64> %ext
%mask = fcmp oeq <32 x half> %vals, zeroinitializer
call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x half*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
define void @masked_scatter_32b_unscaled_sext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
; CHECK-LABEL: masked_scatter_32b_unscaled_sext:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]]
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h
; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x half>, <32 x half>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = sext <32 x i32> %idxs to <32 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <32 x i64> %ext
%ptrs = bitcast <32 x i8*> %byte_ptrs to <32 x half*>
%mask = fcmp oeq <32 x half> %vals, zeroinitializer
call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x half*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
define void @masked_scatter_32b_unscaled_zext(<32 x half>* %a, <32 x i32>* %b, i8* %base) #0 {
; CHECK-LABEL: masked_scatter_32b_unscaled_zext:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].h, vl32
; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].h, #0
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, [[ZERO]]
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h
; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x half>, <32 x half>* %a
%idxs = load <32 x i32>, <32 x i32>* %b
%ext = zext <32 x i32> %idxs to <32 x i64>
%byte_ptrs = getelementptr i8, i8* %base, <32 x i64> %ext
%ptrs = bitcast <32 x i8*> %byte_ptrs to <32 x half*>
%mask = fcmp oeq <32 x half> %vals, zeroinitializer
call void @llvm.masked.scatter.v32f16(<32 x half> %vals, <32 x half*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
define void @masked_scatter_64b_scaled(<32 x float>* %a, <32 x i64>* %b, float* %base) #0 {
; CHECK-LABEL: masked_scatter_64b_scaled:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]]
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d, lsl #2]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x float>, <32 x float>* %a
%idxs = load <32 x i64>, <32 x i64>* %b
%ptrs = getelementptr float, float* %base, <32 x i64> %idxs
%mask = fcmp oeq <32 x float> %vals, zeroinitializer
call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x float*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
define void @masked_scatter_64b_unscaled(<32 x float>* %a, <32 x i64>* %b, i8* %base) #0 {
; CHECK-LABEL: masked_scatter_64b_unscaled:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]]
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x float>, <32 x float>* %a
%idxs = load <32 x i64>, <32 x i64>* %b
%byte_ptrs = getelementptr i8, i8* %base, <32 x i64> %idxs
%ptrs = bitcast <32 x i8*> %byte_ptrs to <32 x float*>
%mask = fcmp oeq <32 x float> %vals, zeroinitializer
call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x float*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
; FIXME: This case does not yet codegen well due to deficiencies in opcode selection
define void @masked_scatter_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) #0 {
; CHECK-LABEL: masked_scatter_vec_plus_reg:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, x2
; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]]
; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x float>, <32 x float>* %a
%bases = load <32 x i8*>, <32 x i8*>* %b
%byte_ptrs = getelementptr i8, <32 x i8*> %bases, i64 %off
%ptrs = bitcast <32 x i8*> %byte_ptrs to <32 x float*>
%mask = fcmp oeq <32 x float> %vals, zeroinitializer
call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x float*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
; FIXME: This case does not yet codegen well due to deficiencies in opcode selection
define void @masked_scatter_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 {
; CHECK-LABEL: masked_scatter_vec_plus_imm:
; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32
; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, #4
; VBITS_GE_2048-NEXT: mov [[ZERO:z[0-9]+]].s, #0
; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, [[ZERO]]
; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d
; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d]
; VBITS_GE_2048-NEXT: ret
%vals = load <32 x float>, <32 x float>* %a
%bases = load <32 x i8*>, <32 x i8*>* %b
%byte_ptrs = getelementptr i8, <32 x i8*> %bases, i64 4
%ptrs = bitcast <32 x i8*> %byte_ptrs to <32 x float*>
%mask = fcmp oeq <32 x float> %vals, zeroinitializer
call void @llvm.masked.scatter.v32f32(<32 x float> %vals, <32 x float*> %ptrs, i32 8, <32 x i1> %mask)
ret void
}
declare void @llvm.masked.scatter.v2i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v8i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v32i8(<32 x i8>, <32 x i8*>, i32, <32 x i1>)
declare void @llvm.masked.scatter.v2i16(<2 x i16>, <2 x i16*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v8i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i16(<16 x i16>, <16 x i16*>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v32i16(<32 x i16>, <32 x i16*>, i32, <32 x i1>)
declare void @llvm.masked.scatter.v2i32(<2 x i32>, <2 x i32*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v8i32(<8 x i32>, <8 x i32*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i32(<16 x i32>, <16 x i32*>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v32i32(<32 x i32>, <32 x i32*>, i32, <32 x i1>)
declare void @llvm.masked.scatter.v1i64(<1 x i64>, <1 x i64*>, i32, <1 x i1>)
declare void @llvm.masked.scatter.v2i64(<2 x i64>, <2 x i64*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4i64(<4 x i64>, <4 x i64*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v8i64(<8 x i64>, <8 x i64*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16i64(<16 x i64>, <16 x i64*>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v32i64(<32 x i64>, <32 x i64*>, i32, <32 x i1>)
declare void @llvm.masked.scatter.v2f16(<2 x half>, <2 x half*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v8f16(<8 x half>, <8 x half*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16f16(<16 x half>, <16 x half*>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v32f16(<32 x half>, <32 x half*>, i32, <32 x i1>)
declare void @llvm.masked.scatter.v2f32(<2 x float>, <2 x float*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v8f32(<8 x float>, <8 x float*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v32f32(<32 x float>, <32 x float*>, i32, <32 x i1>)
declare void @llvm.masked.scatter.v1f64(<1 x double>, <1 x double*>, i32, <1 x i1>)
declare void @llvm.masked.scatter.v2f64(<2 x double>, <2 x double*>, i32, <2 x i1>)
declare void @llvm.masked.scatter.v4f64(<4 x double>, <4 x double*>, i32, <4 x i1>)
declare void @llvm.masked.scatter.v8f64(<8 x double>, <8 x double*>, i32, <8 x i1>)
declare void @llvm.masked.scatter.v16f64(<16 x double>, <16 x double*>, i32, <16 x i1>)
declare void @llvm.masked.scatter.v32f64(<32 x double>, <32 x double*>, i32, <32 x i1>)
attributes #0 = { "target-features"="+sve" }


@@ -106,6 +106,27 @@ define <vscale x 2 x i64> @masked_sgather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vs
ret <vscale x 2 x i64> %vals.sext
}
define <vscale x 2 x i64> @masked_gather_passthru(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthru) {
; CHECK-LABEL: masked_gather_passthru:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [z0.d]
; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT: ret
%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> %passthru)
%vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
ret <vscale x 2 x i64> %vals.sext
}
define <vscale x 2 x i64> @masked_gather_passthru_0(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %mask) {
; CHECK-LABEL: masked_gather_passthru_0:
; CHECK: // %bb.0:
; CHECK-NEXT: ld1sw { z0.d }, p0/z, [z0.d]
; CHECK-NEXT: ret
%vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> zeroinitializer)
%vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
ret <vscale x 2 x i64> %vals.sext
}
declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)