[SVE][CodeGen] Vector + immediate addressing mode for masked gather/scatter

This patch extends LowerMGATHER/MSCATTER to make use of the vector + reg/immediate addressing modes for scalable masked gathers & scatters. selectGatherScatterAddrMode checks if the base pointer is null, in which case we can swap the base pointer and the index, e.g. getelementptr nullptr, <vscale x N x T> (splat(%offset)) + %indices) -> getelementptr %offset, <vscale x N x T> %indices Reviewed By: david-arm Differential Revision: https://reviews.llvm.org/D93132
2025-01-31 20:51:52 +01:00 · 2020-12-18 11:04:41 +00:00 · 2020-12-18 11:04:41 +00:00 · a9d5f92f0d
commit a9d5f92f0d
parent 28cf5e1f81
8 changed files with 847 additions and 10 deletions
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -3812,6 +3812,8 @@ unsigned getSignExtendedGatherOpcode(unsigned Opcode) {
    return Opcode;
  case AArch64ISD::GLD1_MERGE_ZERO:
    return AArch64ISD::GLD1S_MERGE_ZERO;
+  case AArch64ISD::GLD1_IMM_MERGE_ZERO:
+    return AArch64ISD::GLD1S_IMM_MERGE_ZERO;
  case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
    return AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
  case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
@ -3843,6 +3845,60 @@ bool getGatherScatterIndexIsExtended(SDValue Index) {
  return false;
 }

+// If the base pointer of a masked gather or scatter is null, we
+// may be able to swap BasePtr & Index and use the vector + register
+// or vector + immediate addressing mode, e.g.
+// VECTOR + REGISTER:
+//    getelementptr nullptr, <vscale x N x T> (splat(%offset)) + %indices)
+// -> getelementptr %offset, <vscale x N x T> %indices
+// VECTOR + IMMEDIATE:
+//    getelementptr nullptr, <vscale x N x T> (splat(#x)) + %indices)
+// -> getelementptr #x, <vscale x N x T> %indices
+void selectGatherScatterAddrMode(SDValue &BasePtr, SDValue &Index, EVT MemVT,
+                                 unsigned &Opcode, bool IsGather,
+                                 SelectionDAG &DAG) {
+  if (!isNullConstant(BasePtr))
+    return;
+
+  ConstantSDNode *Offset = nullptr;
+  if (Index.getOpcode() == ISD::ADD)
+    if (auto SplatVal = DAG.getSplatValue(Index.getOperand(1))) {
+      if (isa<ConstantSDNode>(SplatVal))
+        Offset = cast<ConstantSDNode>(SplatVal);
+      else {
+        BasePtr = SplatVal;
+        Index = Index->getOperand(0);
+        return;
+      }
+    }
+
+  unsigned NewOp =
+      IsGather ? AArch64ISD::GLD1_IMM_MERGE_ZERO : AArch64ISD::SST1_IMM_PRED;
+
+  if (!Offset) {
+    std::swap(BasePtr, Index);
+    Opcode = NewOp;
+    return;
+  }
+
+  uint64_t OffsetVal = Offset->getZExtValue();
+  unsigned ScalarSizeInBytes = MemVT.getScalarSizeInBits() / 8;
+  auto ConstOffset = DAG.getConstant(OffsetVal, SDLoc(Index), MVT::i64);
+
+  if (OffsetVal % ScalarSizeInBytes || OffsetVal / ScalarSizeInBytes > 31) {
+    // Index is out of range for the immediate addressing mode
+    BasePtr = ConstOffset;
+    Index = Index->getOperand(0);
+    return;
+  }
+
+  // Immediate is in range
+  Opcode = NewOp;
+  BasePtr = Index->getOperand(0);
+  Index = ConstOffset;
+  return;
+}
+
 SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
                                            SelectionDAG &DAG) const {
  SDLoc DL(Op);
@ -3892,6 +3948,9 @@ SDValue AArch64TargetLowering::LowerMGATHER(SDValue Op,
    Index = Index.getOperand(0);

  unsigned Opcode = getGatherVecOpcode(IsScaled, IsSigned, IdxNeedsExtend);
+  selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
+                              /*isGather=*/true, DAG);
+
  if (ResNeedsSignExtend)
    Opcode = getSignExtendedGatherOpcode(Opcode);

@ -3944,9 +4003,12 @@ SDValue AArch64TargetLowering::LowerMSCATTER(SDValue Op,
  if (getGatherScatterIndexIsExtended(Index))
    Index = Index.getOperand(0);

+  unsigned Opcode = getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend);
+  selectGatherScatterAddrMode(BasePtr, Index, MemVT, Opcode,
+                              /*isGather=*/false, DAG);
+
  SDValue Ops[] = {Chain, StoreVal, Mask, BasePtr, Index, InputVT};
-  return DAG.getNode(getScatterVecOpcode(IsScaled, IsSigned, NeedsExtend), DL,
-                     VTs, Ops);
+  return DAG.getNode(Opcode, DL, VTs, Ops);
 }

 // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
--- a/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
+++ b/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@ -44,12 +44,29 @@ define <vscale x 2 x i64> @masked_sgather_zext(i8* %base, <vscale x 2 x i64> %of

 ; Tests that exercise various type legalisation scenarios for ISD::MGATHER.

+; Code generate load of an illegal datatype via promotion.
+define <vscale x 2 x i8> @masked_gather_nxv2i8(<vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8:
+; CHECK: ld1sb { z0.d }, p0/z, [z0.d]
+; CHECK: ret
+  %data = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  ret <vscale x 2 x i8> %data
+}
+
+; Code generate load of an illegal datatype via promotion.
+define <vscale x 2 x i16> @masked_gather_nxv2i16(<vscale x 2 x i16*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK: ld1sh { z0.d }, p0/z, [z0.d]
+; CHECK: ret
+  %data = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  ret <vscale x 2 x i16> %data
+}
+
 ; Code generate load of an illegal datatype via promotion.
 define <vscale x 2 x i32> @masked_gather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %mask) {
 ; CHECK-LABEL: masked_gather_nxv2i32:
-; CHECK-DAG: mov x8, xzr
-; CHECK-DAG: ld1sw { z0.d }, p0/z, [x8, z0.d]
-; CHECK:     ret
+; CHECK: ld1sw { z0.d }, p0/z, [z0.d]
+; CHECK: ret
  %data = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
  ret <vscale x 2 x i32> %data
 }
@ -92,11 +109,10 @@ define <vscale x 32 x i32> @masked_gather_nxv32i32(i32* %base, <vscale x 32 x i3
 define <vscale x 4 x i32> @masked_sgather_nxv4i8(<vscale x 4 x i8*> %ptrs, <vscale x 4 x i1> %mask) {
 ; CHECK-LABEL: masked_sgather_nxv4i8:
 ; CHECK:         pfalse p1.b
-; CHECK-NEXT:    mov x8, xzr
 ; CHECK-NEXT:    zip2 p2.s, p0.s, p1.s
 ; CHECK-NEXT:    zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT:    ld1sb { z1.d }, p2/z, [x8, z1.d]
-; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ld1sb { z1.d }, p2/z, [z1.d]
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT:    ptrue p0.s
 ; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
@ -109,8 +125,6 @@ define <vscale x 4 x i32> @masked_sgather_nxv4i8(<vscale x 4 x i8*> %ptrs, <vsca
 declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
 declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
 declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
-
 declare <vscale x 4 x i8> @llvm.masked.gather.nxv4i8(<vscale x 4 x i8*>, i32, <vscale x 4 x i1>, <vscale x 4 x i8>)
-
 declare <vscale x 16 x i8> @llvm.masked.gather.nxv16i8(<vscale x 16 x i8*>, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
 declare <vscale x 32 x i32> @llvm.masked.gather.nxv32i32(<vscale x 32 x i32*>, i32, <vscale x 32 x i1>, <vscale x 32 x i32>)
--- a/test/CodeGen/AArch64/sve-masked-gather-vec-plus-imm.ll
+++ b/test/CodeGen/AArch64/sve-masked-gather-vec-plus-imm.ll
@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 2 x i64> @masked_gather_nxv2i8(<vscale x 2 x i8*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d, #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i32 1
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(<vscale x 2 x i16*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d, #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, <vscale x 2 x i16*> %bases, i32 1
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(<vscale x 2 x i32*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d, #4]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, <vscale x 2 x i32*> %bases, i32 1
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(<vscale x 2 x i64*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d, #8]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i64, <vscale x 2 x i64*> %bases, i32 1
+  %vals.zext = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(<vscale x 2 x half*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d, #4]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr half, <vscale x 2 x half*> %bases, i32 2
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x bfloat> @masked_gather_nxv2bf16(<vscale x 2 x bfloat*> %bases, <vscale x 2 x i1> %mask) #0 {
+; CHECK-LABEL: masked_gather_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d, #4]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr bfloat, <vscale x 2 x bfloat*> %bases, i32 2
+  %vals = call <vscale x 2 x bfloat> @llvm.masked.gather.nxv2bf16(<vscale x 2 x bfloat*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x bfloat> undef)
+  ret <vscale x 2 x bfloat> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(<vscale x 2 x float*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d, #12]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr float, <vscale x 2 x float*> %bases, i32 3
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(<vscale x 2 x double*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d, #32]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr double, <vscale x 2 x double*> %bases, i32 4
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i8(<vscale x 2 x i8*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [z0.d, #5]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i32 5
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(<vscale x 2 x i16*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [z0.d, #12]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, <vscale x 2 x i16*> %bases, i32 6
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(<vscale x 2 x i32*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [z0.d, #28]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, <vscale x 2 x i32*> %bases, i32 7
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+; Tests where the immediate is out of range
+
+define <vscale x 2 x i64> @masked_gather_nxv2i8_range(<vscale x 2 x i8*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #32
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i32 32
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16_range(<vscale x 2 x half*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #64
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr half, <vscale x 2 x half*> %bases, i32 32
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x bfloat> @masked_gather_nxv2bf16_range(<vscale x 2 x bfloat*> %bases, <vscale x 2 x i1> %mask) #0 {
+; CHECK-LABEL: masked_gather_nxv2bf16_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #64
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr bfloat, <vscale x 2 x bfloat*> %bases, i32 32
+  %vals = call <vscale x 2 x bfloat> @llvm.masked.gather.nxv2bf16(<vscale x 2 x bfloat*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x bfloat> undef)
+  ret <vscale x 2 x bfloat> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32_range(<vscale x 2 x float*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #128
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr float, <vscale x 2 x float*> %bases, i32 32
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64_range(<vscale x 2 x double*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #256
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x8, z0.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr double, <vscale x 2 x double*> %bases, i32 32
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x bfloat> @llvm.masked.gather.nxv2bf16(<vscale x 2 x bfloat*>, i32, <vscale x 2 x i1>, <vscale x 2 x bfloat>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+attributes #0 = { "target-features"="+sve,+bf16" }
--- a/test/CodeGen/AArch64/sve-masked-gather-vec-plus-reg.ll
+++ b/test/CodeGen/AArch64/sve-masked-gather-vec-plus-reg.ll
@ -0,0 +1,137 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 2 x i64> @masked_gather_nxv2i8(<vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(<vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(<vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(<vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(<vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x bfloat> @masked_gather_nxv2bf16(<vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) #0 {
+; CHECK-LABEL: masked_gather_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x bfloat*>
+  %vals = call <vscale x 2 x bfloat> @llvm.masked.gather.nxv2bf16(<vscale x 2 x bfloat*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x bfloat> undef)
+  ret <vscale x 2 x bfloat> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(<vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(<vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i8(<vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(<vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(<vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [x0, z0.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x bfloat> @llvm.masked.gather.nxv2bf16(<vscale x 2 x bfloat*>, i32, <vscale x 2 x i1>, <vscale x 2 x bfloat>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+attributes #0 = { "target-features"="+sve,+bf16" }
--- a/test/CodeGen/AArch64/sve-masked-gather.ll
+++ b/test/CodeGen/AArch64/sve-masked-gather.ll
@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define <vscale x 2 x i64> @masked_gather_nxv2i8(<vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1b { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ret
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.zext = zext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i16(<vscale x 2 x i16*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ret
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.zext = zext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ret
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.zext = zext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.zext
+}
+
+define <vscale x 2 x i64> @masked_gather_nxv2i64(<vscale x 2 x i64*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ret
+  %vals = call <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x i64> undef)
+  ret <vscale x 2 x i64> %vals
+}
+
+define <vscale x 2 x half> @masked_gather_nxv2f16(<vscale x 2 x half*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ret
+  %vals = call <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x half> undef)
+  ret <vscale x 2 x half> %vals
+}
+
+define <vscale x 2 x bfloat> @masked_gather_nxv2bf16(<vscale x 2 x bfloat*> %ptrs, <vscale x 2 x i1> %mask) #0 {
+; CHECK-LABEL: masked_gather_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1h { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ret
+  %vals = call <vscale x 2 x bfloat> @llvm.masked.gather.nxv2bf16(<vscale x 2 x bfloat*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x bfloat> undef)
+  ret <vscale x 2 x bfloat> %vals
+}
+
+define <vscale x 2 x float> @masked_gather_nxv2f32(<vscale x 2 x float*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ret
+  %vals = call <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x float> undef)
+  ret <vscale x 2 x float> %vals
+}
+
+define <vscale x 2 x double> @masked_gather_nxv2f64(<vscale x 2 x double*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_gather_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1d { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ret
+  %vals = call <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask, <vscale x 2 x double> undef)
+  ret <vscale x 2 x double> %vals
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i8(<vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sb { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ret
+  %vals = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask, <vscale x 2 x i8> undef)
+  %vals.sext = sext <vscale x 2 x i8> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i16(<vscale x 2 x i16*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sh { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ret
+  %vals = call <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask, <vscale x 2 x i16> undef)
+  %vals.sext = sext <vscale x 2 x i16> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+define <vscale x 2 x i64> @masked_sgather_nxv2i32(<vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_sgather_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ld1sw { z0.d }, p0/z, [z0.d]
+; CHECK-NEXT:    ret
+  %vals = call <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask, <vscale x 2 x i32> undef)
+  %vals.sext = sext <vscale x 2 x i32> %vals to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %vals.sext
+}
+
+declare <vscale x 2 x i8> @llvm.masked.gather.nxv2i8(<vscale x 2 x i8*>, i32, <vscale x 2 x i1>, <vscale x 2 x i8>)
+declare <vscale x 2 x i16> @llvm.masked.gather.nxv2i16(<vscale x 2 x i16*>, i32, <vscale x 2 x i1>, <vscale x 2 x i16>)
+declare <vscale x 2 x i32> @llvm.masked.gather.nxv2i32(<vscale x 2 x i32*>, i32, <vscale x 2 x i1>, <vscale x 2 x i32>)
+declare <vscale x 2 x i64> @llvm.masked.gather.nxv2i64(<vscale x 2 x i64*>, i32, <vscale x 2 x i1>, <vscale x 2 x i64>)
+declare <vscale x 2 x half> @llvm.masked.gather.nxv2f16(<vscale x 2 x half*>, i32, <vscale x 2 x i1>, <vscale x 2 x half>)
+declare <vscale x 2 x bfloat> @llvm.masked.gather.nxv2bf16(<vscale x 2 x bfloat*>, i32, <vscale x 2 x i1>, <vscale x 2 x bfloat>)
+declare <vscale x 2 x float> @llvm.masked.gather.nxv2f32(<vscale x 2 x float*>, i32, <vscale x 2 x i1>, <vscale x 2 x float>)
+declare <vscale x 2 x double> @llvm.masked.gather.nxv2f64(<vscale x 2 x double*>, i32, <vscale x 2 x i1>, <vscale x 2 x double>)
+attributes #0 = { "target-features"="+sve,+bf16" }
--- a/test/CodeGen/AArch64/sve-masked-scatter-vec-plus-imm.ll
+++ b/test/CodeGen/AArch64/sve-masked-scatter-vec-plus-imm.ll
@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define void @masked_scatter_nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1b { z0.d }, p0, [z1.d, #1]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i32 1
+  call void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d, #2]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, <vscale x 2 x i16*> %bases, i32 1
+  call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d, #4]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, <vscale x 2 x i32*> %bases, i32 1
+  call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d, #8]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i64, <vscale x 2 x i64*> %bases, i32 1
+  call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d, #4]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr half, <vscale x 2 x half*> %bases, i32 2
+  call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %bases, <vscale x 2 x i1> %mask) #0 {
+; CHECK-LABEL: masked_scatter_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d, #4]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr bfloat, <vscale x 2 x bfloat*> %bases, i32 2
+  call void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, i32 2, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d, #12]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr float, <vscale x 2 x float*> %bases, i32 3
+  call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d, #32]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr double, <vscale x 2 x double*> %bases, i32 4
+  call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+; Test where the immediate is out of range
+
+define void @masked_scatter_nxv2i8_range(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i8_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #32
+; CHECK-NEXT:    st1b { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i32 32
+  call void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2i16_range(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i16_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #64
+; CHECK-NEXT:    st1h { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i16, <vscale x 2 x i16*> %bases, i32 32
+  call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2i32_range(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i32_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #128
+; CHECK-NEXT:    st1w { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i32, <vscale x 2 x i32*> %bases, i32 32
+  call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 1, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2f64_range(<vscale x 2 x double> %data, <vscale x 2 x double*> %bases, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2f64_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #256
+; CHECK-NEXT:    st1d { z0.d }, p0, [x8, z1.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr double, <vscale x 2 x double*> %bases, i32 32
+  call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8*>,  i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)
+attributes #0 = { "target-features"="+sve,+bf16" }
--- a/test/CodeGen/AArch64/sve-masked-scatter-vec-plus-reg.ll
+++ b/test/CodeGen/AArch64/sve-masked-scatter-vec-plus-reg.ll
@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+define void @masked_scatter_nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1b { z0.d }, p0, [x0, z1.d]
+; CHECK-NEXT:    ret
+  %ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  call void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %ptrs, i32 1, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1h { z0.d }, p0, [x0, z1.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i16*>
+  call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 2, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1w { z0.d }, p0, [x0, z1.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i32*>
+  call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 4, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, z1.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x i64*>
+  call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 8, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1h { z0.d }, p0, [x0, z1.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x half*>
+  call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 2, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) #0 {
+; CHECK-LABEL: masked_scatter_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1h { z0.d }, p0, [x0, z1.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x bfloat*>
+  call void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, i32 2, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1w { z0.d }, p0, [x0, z1.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x float*>
+  call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 4, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+define void @masked_scatter_nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i8*> %bases, i64 %offset, <vscale x 2 x i1> %mask) {
+; CHECK-LABEL: masked_scatter_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1d { z0.d }, p0, [x0, z1.d]
+; CHECK-NEXT:    ret
+  %byte_ptrs = getelementptr i8, <vscale x 2 x i8*> %bases, i64 %offset
+  %ptrs = bitcast <vscale x 2 x i8*> %byte_ptrs to <vscale x 2 x double*>
+  call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 8, <vscale x 2 x i1> %mask)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8*>,  i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)
+attributes #0 = { "target-features"="+sve,+bf16" }
--- a/test/CodeGen/AArch64/sve-masked-scatter.ll
+++ b/test/CodeGen/AArch64/sve-masked-scatter.ll
@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve  < %s | FileCheck %s
+
+define void @masked_scatter_nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %ptrs, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1b { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8> %data, <vscale x 2 x i8*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+  ret void
+}
+
+define void @masked_scatter_nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16> %data, <vscale x 2 x i16*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+  ret void
+}
+
+define void @masked_scatter_nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32> %data, <vscale x 2 x i32*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+  ret void
+}
+
+define void @masked_scatter_nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i64*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+  ret void
+}
+
+define void @masked_scatter_nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half> %data, <vscale x 2 x half*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+  ret void
+}
+
+define void @masked_scatter_nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, <vscale x 2 x i1> %masks) nounwind #0 {
+; CHECK-LABEL: masked_scatter_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1h { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat> %data, <vscale x 2 x bfloat*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+  ret void
+}
+
+define void @masked_scatter_nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1w { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float> %data, <vscale x 2 x float*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+  ret void
+}
+
+define void @masked_scatter_nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, <vscale x 2 x i1> %masks) nounwind {
+; CHECK-LABEL: masked_scatter_nxv2f64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
+; CHECK-NEXT:    ret
+  call void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x double*> %ptrs, i32 0, <vscale x 2 x i1> %masks)
+  ret void
+}
+
+declare void @llvm.masked.scatter.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i16(<vscale x 2 x i16>, <vscale x 2 x i16*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i32(<vscale x 2 x i32>, <vscale x 2 x i32*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64*>, i32, <vscale x 2 x i1>)
+declare void @llvm.masked.scatter.nxv2i8(<vscale x 2 x i8>, <vscale x 2 x i8*>, i32, <vscale x 2 x i1>)
+attributes #0 = { "target-features"="+sve,+bf16" }