[CodeGen][AArch64][SVE] Use ld1r[bhsd] for vector splat from memory

This avoids the use of the vector unit for copying from scalar to vector.
There is an extra ptrue instruction, but a predicate register with the
ptrue pattern populated is likely to be free in the context of real code.

Tests were generated from a template to cover the axes mentioned at the
top of the test file.

Co-authored-by: Francesco Petrogalli <francesco.petrogalli@arm.com>

Differential Revision: https://reviews.llvm.org/D103170
2024-10-18 18:42:46 +02:00 · 2021-05-26 14:18:27 +00:00 · 2021-05-26 14:18:27 +00:00 · f69dc8533f
commit f69dc8533f
parent 4e40134cc8
8 changed files with 1279 additions and 120 deletions
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@ -128,6 +128,24 @@ public:
  bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
    return SelectAddrModeUnscaled(N, 16, Base, OffImm);
  }
+  template <unsigned Size, unsigned Max>
+  bool SelectAddrModeIndexedUImm(SDValue N, SDValue &Base, SDValue &OffImm) {
+    // Try the indexed form for Size; accept only a constant OffImm <= Max.
+    // NOTE(review): C is signed, so a negative would pass; confirm none occur.
+    bool Found = SelectAddrModeIndexed(N, Size, Base, OffImm);
+    if (Found) {
+      if (auto *CI = dyn_cast<ConstantSDNode>(OffImm)) {
+        int64_t C = CI->getSExtValue();
+        if (C <= Max)
+          return true;
+      }
+    }
+
+    // No fit: use N as base with offset 0 (address materialized in register).
+    Base = N;
+    OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i64);
+    return true; // Every path reports a match; this selector never fails.
+  }

  template<int Width>
  bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@ -3127,6 +3127,13 @@ def am_indexed32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []>;
 def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>;
 def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>;

+// (unsigned immediate)
+// Indexed for 8/16/32/64-bit accesses. Offset (in access-size units) is in range [0,63].
+def am_indexed8_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<1,63>", []>;
+def am_indexed16_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<2,63>", []>;
+def am_indexed32_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<4,63>", []>;
+def am_indexed64_6b : ComplexPattern<i64, 2, "SelectAddrModeIndexedUImm<8,63>", []>;
+
 def gi_am_indexed8 :
    GIComplexOperandMatcher<s64, "selectAddrModeIndexed<8>">,
    GIComplexPatternEquiv<am_indexed8>;
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@ -2300,6 +2300,22 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
  case AArch64::LD1B_D_IMM:
  case AArch64::LD1SB_D_IMM:
  case AArch64::ST1B_D_IMM:
+  case AArch64::LD1RB_IMM:
+  case AArch64::LD1RB_H_IMM:
+  case AArch64::LD1RB_S_IMM:
+  case AArch64::LD1RB_D_IMM:
+  case AArch64::LD1RSB_H_IMM:
+  case AArch64::LD1RSB_S_IMM:
+  case AArch64::LD1RSB_D_IMM:
+  case AArch64::LD1RH_IMM:
+  case AArch64::LD1RH_S_IMM:
+  case AArch64::LD1RH_D_IMM:
+  case AArch64::LD1RSH_S_IMM:
+  case AArch64::LD1RSH_D_IMM:
+  case AArch64::LD1RW_IMM:
+  case AArch64::LD1RW_D_IMM:
+  case AArch64::LD1RSW_IMM:
+  case AArch64::LD1RD_IMM:
    return 3;
  case AArch64::ADDG:
  case AArch64::STGOffset:
@ -2913,6 +2929,42 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
    MinOffset = -64;
    MaxOffset = 63;
    break;
+  case AArch64::LD1RB_IMM:
+  case AArch64::LD1RB_H_IMM:
+  case AArch64::LD1RB_S_IMM:
+  case AArch64::LD1RB_D_IMM:
+  case AArch64::LD1RSB_H_IMM:
+  case AArch64::LD1RSB_S_IMM:
+  case AArch64::LD1RSB_D_IMM:
+    Scale = TypeSize::Fixed(1);
+    Width = 1;
+    MinOffset = 0;
+    MaxOffset = 63;
+    break;
+  case AArch64::LD1RH_IMM:
+  case AArch64::LD1RH_S_IMM:
+  case AArch64::LD1RH_D_IMM:
+  case AArch64::LD1RSH_S_IMM:
+  case AArch64::LD1RSH_D_IMM:
+    Scale = TypeSize::Fixed(2);
+    Width = 2;
+    MinOffset = 0;
+    MaxOffset = 63;
+    break;
+  case AArch64::LD1RW_IMM:
+  case AArch64::LD1RW_D_IMM:
+  case AArch64::LD1RSW_IMM:
+    Scale = TypeSize::Fixed(4);
+    Width = 4;
+    MinOffset = 0;
+    MaxOffset = 63;
+    break;
+  case AArch64::LD1RD_IMM:
+    Scale = TypeSize::Fixed(8);
+    Width = 8;
+    MinOffset = 0;
+    MaxOffset = 63;
+    break;
  }

  return true;
--- a/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td
@ -1643,6 +1643,45 @@ let Predicates = [HasSVE] in {
  def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)),
            (PTEST_PP PPR:$pg, PPR:$src)>;

+  let AddedComplexity = 1 in {
+  class LD1RPat<ValueType vt, SDPatternOperator operator,
+                Instruction load, Instruction ptrue, ValueType index_vt, ComplexPattern CP, Operand immtype> :
+        Pat<(vt (AArch64dup (index_vt (operator (CP GPR64:$base, immtype:$offset))))),
+            (load (ptrue 31), GPR64:$base, $offset)>;
+  }
+
+  // LD1R of 8-bit data
+  def : LD1RPat<nxv16i8, extloadi8,  LD1RB_IMM,    PTRUE_B, i32, am_indexed8_6b, uimm6s1>;
+  def : LD1RPat<nxv8i16, zextloadi8, LD1RB_H_IMM,  PTRUE_H, i32, am_indexed8_6b, uimm6s1>;
+  def : LD1RPat<nxv4i32, zextloadi8, LD1RB_S_IMM,  PTRUE_S, i32, am_indexed8_6b, uimm6s1>;
+  def : LD1RPat<nxv2i64, zextloadi8, LD1RB_D_IMM,  PTRUE_D, i64, am_indexed8_6b, uimm6s1>;
+  def : LD1RPat<nxv8i16, sextloadi8, LD1RSB_H_IMM, PTRUE_H, i32, am_indexed8_6b, uimm6s1>;
+  def : LD1RPat<nxv4i32, sextloadi8, LD1RSB_S_IMM, PTRUE_S, i32, am_indexed8_6b, uimm6s1>;
+  def : LD1RPat<nxv2i64, sextloadi8, LD1RSB_D_IMM, PTRUE_D, i64, am_indexed8_6b, uimm6s1>;
+
+  // LD1R of 16-bit data
+  def : LD1RPat<nxv8i16, extloadi16,  LD1RH_IMM,    PTRUE_H, i32, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv4i32, zextloadi16, LD1RH_S_IMM,  PTRUE_S, i32, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv2i64, zextloadi16, LD1RH_D_IMM,  PTRUE_D, i64, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv4i32, sextloadi16, LD1RSH_S_IMM, PTRUE_S, i32, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv2i64, sextloadi16, LD1RSH_D_IMM, PTRUE_D, i64, am_indexed16_6b, uimm6s2>;
+
+  // LD1R of 32-bit data
+  def : LD1RPat<nxv4i32, load,        LD1RW_IMM,   PTRUE_S, i32, am_indexed32_6b, uimm6s4>;
+  def : LD1RPat<nxv2i64, zextloadi32, LD1RW_D_IMM, PTRUE_D, i64, am_indexed32_6b, uimm6s4>;
+  def : LD1RPat<nxv2i64, sextloadi32, LD1RSW_IMM,  PTRUE_D, i64, am_indexed32_6b, uimm6s4>;
+
+  // LD1R of 64-bit data
+  def : LD1RPat<nxv2i64, load, LD1RD_IMM, PTRUE_D, i64, am_indexed64_6b, uimm6s8>;
+
+  // LD1R of FP data
+  def : LD1RPat<nxv8f16, load, LD1RH_IMM,   PTRUE_H, f16, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv4f16, load, LD1RH_S_IMM, PTRUE_S, f16, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv2f16, load, LD1RH_D_IMM, PTRUE_D, f16, am_indexed16_6b, uimm6s2>;
+  def : LD1RPat<nxv4f32, load, LD1RW_IMM,   PTRUE_S, f32, am_indexed32_6b, uimm6s4>;
+  def : LD1RPat<nxv2f32, load, LD1RW_D_IMM, PTRUE_D, f32, am_indexed32_6b, uimm6s4>;
+  def : LD1RPat<nxv2f64, load, LD1RD_IMM,   PTRUE_D, f64, am_indexed64_6b, uimm6s8>;
+
  // LD1R of 128-bit masked data
  def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
            (LD1RQ_B_IMM $gp, $base, (i64 0))>;
--- a/test/CodeGen/AArch64/sve-ld-post-inc.ll
+++ b/test/CodeGen/AArch64/sve-ld-post-inc.ll
@ -23,10 +23,10 @@ define <vscale x 4 x i32> @test_post_ld1_insert(i32* %a, i32** %ptr, i64 %inc) {
 define <vscale x 2 x double> @test_post_ld1_dup(double* %a, double** %ptr, i64 %inc) {
 ; CHECK-LABEL: test_post_ld1_dup:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ldr d0, [x0]
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
 ; CHECK-NEXT:    add x8, x0, x2, lsl #3
 ; CHECK-NEXT:    str x8, [x1]
-; CHECK-NEXT:    mov z0.d, d0
 ; CHECK-NEXT:    ret
  %load = load double, double* %a
  %dup = call <vscale x 2 x double> @llvm.aarch64.sve.dup.x.nxv2f64(double %load)
--- a/test/CodeGen/AArch64/sve-ld1r.ll
+++ b/test/CodeGen/AArch64/sve-ld1r.ll
@ -0,0 +1,724 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
+;
+; Check that an ld1r* instruction is generated to splat a scalar during load,
+; rather than mov from scalar to vector register (which would require the vector unit).
+;
+; one-off: ld1r_stack checks that ld1rb works with stack objects.
+;
+; Test axes:
+;   types = [i8, i16, i32, i64, half, float, double]
+;   methods = [direct load, gep upper bound - 1, gep out of range x {neg,pos}, sext..., zext..., unpacked_floats...]
+;
+
+@g8 = external global i8
+
+; One-off test for splatted value coming from stack load.
+define <vscale x 16 x i8> @ld1r_stack() {
+; CHECK-LABEL: ld1r_stack:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub sp, sp, #16 // =16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    adrp x8, :got:g8
+; CHECK-NEXT:    ldr x8, [x8, :got_lo12:g8]
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ldrb w8, [x8]
+; CHECK-NEXT:    strb w8, [sp, #12]
+; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [sp, #14]
+; CHECK-NEXT:    add sp, sp, #16 // =16
+; CHECK-NEXT:    ret
+  %valp = alloca i8
+  %valp2  = load volatile i8, i8* @g8
+  store volatile i8 %valp2, i8* %valp
+  %valp3 = getelementptr i8, i8* %valp, i32 2
+  %val = load i8, i8* %valp3
+  %1 = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
+  %2 = shufflevector <vscale x 16 x i8> %1, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
+  ret <vscale x 16 x i8> %2
+}
+
+define <vscale x 16 x i8> @ld1rb(i8* %valp) {
+; CHECK-LABEL: ld1rb:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i8, i8* %valp
+  %ins = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
+  %shf = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
+  ret <vscale x 16 x i8> %shf
+}
+
+define <vscale x 16 x i8> @ld1rb_gep(i8* %valp) {
+; CHECK-LABEL: ld1rb_gep:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x0, #63]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr i8, i8* %valp, i32 63
+  %val = load i8, i8* %valp2
+  %ins = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
+  %shf = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
+  ret <vscale x 16 x i8> %shf
+}
+
+define <vscale x 16 x i8> @ld1rb_gep_out_of_range_up(i8* %valp) {
+; CHECK-LABEL: ld1rb_gep_out_of_range_up:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #64 // =64
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr i8, i8* %valp, i32 64
+  %val = load i8, i8* %valp2
+  %ins = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
+  %shf = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
+  ret <vscale x 16 x i8> %shf
+}
+
+define <vscale x 16 x i8> @ld1rb_gep_out_of_range_down(i8* %valp) {
+; CHECK-LABEL: ld1rb_gep_out_of_range_down:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, #1 // =1
+; CHECK-NEXT:    ptrue p0.b
+; CHECK-NEXT:    ld1rb { z0.b }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr i8, i8* %valp, i32 -1
+  %val = load i8, i8* %valp2
+  %ins = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
+  %shf = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
+  ret <vscale x 16 x i8> %shf
+}
+
+define <vscale x 8 x i16> @ld1rb_i8_i16_zext(i8* %valp) {
+; CHECK-LABEL: ld1rb_i8_i16_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rb { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i8, i8* %valp
+  %ext = zext i8 %val to i16
+  %ins = insertelement <vscale x 8 x i16> undef, i16 %ext, i32 0
+  %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x i16> %shf
+}
+
+define <vscale x 8 x i16> @ld1rb_i8_i16_sext(i8* %valp) {
+; CHECK-LABEL: ld1rb_i8_i16_sext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rsb { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i8, i8* %valp
+  %ext = sext i8 %val to i16
+  %ins = insertelement <vscale x 8 x i16> undef, i16 %ext, i32 0
+  %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x i16> %shf
+}
+
+define <vscale x 4 x i32> @ld1rb_i8_i32_zext(i8* %valp) {
+; CHECK-LABEL: ld1rb_i8_i32_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rb { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i8, i8* %valp
+  %ext = zext i8 %val to i32
+  %ins = insertelement <vscale x 4 x i32> undef, i32 %ext, i32 0
+  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> %shf
+}
+
+define <vscale x 4 x i32> @ld1rb_i8_i32_sext(i8* %valp) {
+; CHECK-LABEL: ld1rb_i8_i32_sext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rsb { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i8, i8* %valp
+  %ext = sext i8 %val to i32
+  %ins = insertelement <vscale x 4 x i32> undef, i32 %ext, i32 0
+  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> %shf
+}
+
+define <vscale x 2 x i64> @ld1rb_i8_i64_zext(i8* %valp) {
+; CHECK-LABEL: ld1rb_i8_i64_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rb { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i8, i8* %valp
+  %ext = zext i8 %val to i64
+  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
+  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i64> %shf
+}
+
+define <vscale x 2 x i64> @ld1rb_i8_i64_sext(i8* %valp) {
+; CHECK-LABEL: ld1rb_i8_i64_sext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rsb { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i8, i8* %valp
+  %ext = sext i8 %val to i64
+  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
+  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i64> %shf
+}
+
+define <vscale x 8 x i16> @ld1rh(i16* %valp) {
+; CHECK-LABEL: ld1rh:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i16, i16* %valp
+  %ins = insertelement <vscale x 8 x i16> undef, i16 %val, i32 0
+  %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x i16> %shf
+}
+
+define <vscale x 8 x i16> @ld1rh_gep(i16* %valp) {
+; CHECK-LABEL: ld1rh_gep:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0, #126]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr i16, i16* %valp, i32 63
+  %val = load i16, i16* %valp2
+  %ins = insertelement <vscale x 8 x i16> undef, i16 %val, i32 0
+  %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x i16> %shf
+}
+
+define <vscale x 8 x i16> @ld1rh_gep_out_of_range_up(i16* %valp) {
+; CHECK-LABEL: ld1rh_gep_out_of_range_up:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #128 // =128
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr i16, i16* %valp, i32 64
+  %val = load i16, i16* %valp2
+  %ins = insertelement <vscale x 8 x i16> undef, i16 %val, i32 0
+  %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x i16> %shf
+}
+
+define <vscale x 8 x i16> @ld1rh_gep_out_of_range_down(i16* %valp) {
+; CHECK-LABEL: ld1rh_gep_out_of_range_down:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, #2 // =2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr i16, i16* %valp, i32 -1
+  %val = load i16, i16* %valp2
+  %ins = insertelement <vscale x 8 x i16> undef, i16 %val, i32 0
+  %shf = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x i16> %shf
+}
+
+define <vscale x 4 x i32> @ld1rh_i16_i32_zext(i16* %valp) {
+; CHECK-LABEL: ld1rh_i16_i32_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rh { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i16, i16* %valp
+  %ext = zext i16 %val to i32
+  %ins = insertelement <vscale x 4 x i32> undef, i32 %ext, i32 0
+  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> %shf
+}
+
+define <vscale x 4 x i32> @ld1rh_i16_i32_sext(i16* %valp) {
+; CHECK-LABEL: ld1rh_i16_i32_sext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rsh { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i16, i16* %valp
+  %ext = sext i16 %val to i32
+  %ins = insertelement <vscale x 4 x i32> undef, i32 %ext, i32 0
+  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> %shf
+}
+
+define <vscale x 2 x i64> @ld1rh_i16_i64_zext(i16* %valp) {
+; CHECK-LABEL: ld1rh_i16_i64_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rh { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i16, i16* %valp
+  %ext = zext i16 %val to i64
+  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
+  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i64> %shf
+}
+
+define <vscale x 2 x i64> @ld1rh_i16_i64_sext(i16* %valp) {
+; CHECK-LABEL: ld1rh_i16_i64_sext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rsh { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i16, i16* %valp
+  %ext = sext i16 %val to i64
+  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
+  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i64> %shf
+}
+
+define <vscale x 4 x i32> @ld1rw(i32* %valp) {
+; CHECK-LABEL: ld1rw:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i32, i32* %valp
+  %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 0
+  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> %shf
+}
+
+define <vscale x 4 x i32> @ld1rw_gep(i32* %valp) {
+; CHECK-LABEL: ld1rw_gep:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0, #252]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr i32, i32* %valp, i32 63
+  %val = load i32, i32* %valp2
+  %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 0
+  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> %shf
+}
+
+define <vscale x 4 x i32> @ld1rw_gep_out_of_range_up(i32* %valp) {
+; CHECK-LABEL: ld1rw_gep_out_of_range_up:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #256 // =256
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr i32, i32* %valp, i32 64
+  %val = load i32, i32* %valp2
+  %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 0
+  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> %shf
+}
+
+define <vscale x 4 x i32> @ld1rw_gep_out_of_range_down(i32* %valp) {
+; CHECK-LABEL: ld1rw_gep_out_of_range_down:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, #4 // =4
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr i32, i32* %valp, i32 -1
+  %val = load i32, i32* %valp2
+  %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 0
+  %shf = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x i32> %shf
+}
+
+define <vscale x 2 x i64> @ld1rw_i32_i64_zext(i32* %valp) {
+; CHECK-LABEL: ld1rw_i32_i64_zext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rw { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i32, i32* %valp
+  %ext = zext i32 %val to i64
+  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
+  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i64> %shf
+}
+
+define <vscale x 2 x i64> @ld1rw_i32_i64_sext(i32* %valp) {
+; CHECK-LABEL: ld1rw_i32_i64_sext:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rsw { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i32, i32* %valp
+  %ext = sext i32 %val to i64
+  %ins = insertelement <vscale x 2 x i64> undef, i64 %ext, i32 0
+  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i64> %shf
+}
+
+define <vscale x 2 x i64> @ld1rd(i64* %valp) {
+; CHECK-LABEL: ld1rd:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load i64, i64* %valp
+  %ins = insertelement <vscale x 2 x i64> undef, i64 %val, i32 0
+  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i64> %shf
+}
+
+define <vscale x 2 x i64> @ld1rd_gep(i64* %valp) {
+; CHECK-LABEL: ld1rd_gep:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0, #504]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr i64, i64* %valp, i32 63
+  %val = load i64, i64* %valp2
+  %ins = insertelement <vscale x 2 x i64> undef, i64 %val, i32 0
+  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i64> %shf
+}
+
+define <vscale x 2 x i64> @ld1rd_gep_out_of_range_up(i64* %valp) {
+; CHECK-LABEL: ld1rd_gep_out_of_range_up:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #512 // =512
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr i64, i64* %valp, i32 64
+  %val = load i64, i64* %valp2
+  %ins = insertelement <vscale x 2 x i64> undef, i64 %val, i32 0
+  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i64> %shf
+}
+
+define <vscale x 2 x i64> @ld1rd_gep_out_of_range_down(i64* %valp) {
+; CHECK-LABEL: ld1rd_gep_out_of_range_down:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, #8 // =8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr i64, i64* %valp, i32 -1
+  %val = load i64, i64* %valp2
+  %ins = insertelement <vscale x 2 x i64> undef, i64 %val, i32 0
+  %shf = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x i64> %shf
+}
+
+define <vscale x 8 x half> @ld1rh_half(half* %valp) {
+; CHECK-LABEL: ld1rh_half:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load half, half* %valp
+  %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
+  %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x half> %shf
+}
+
+define <vscale x 8 x half> @ld1rh_half_gep(half* %valp) {
+; CHECK-LABEL: ld1rh_half_gep:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x0, #126]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr half, half* %valp, i32 63
+  %val = load half, half* %valp2
+  %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
+  %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x half> %shf
+}
+
+define <vscale x 8 x half> @ld1rh_half_gep_out_of_range_up(half* %valp) {
+; CHECK-LABEL: ld1rh_half_gep_out_of_range_up:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #128 // =128
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr half, half* %valp, i32 64
+  %val = load half, half* %valp2
+  %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
+  %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x half> %shf
+}
+
+define <vscale x 8 x half> @ld1rh_half_gep_out_of_range_down(half* %valp) {
+; CHECK-LABEL: ld1rh_half_gep_out_of_range_down:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, #2 // =2
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    ld1rh { z0.h }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr half, half* %valp, i32 -1
+  %val = load half, half* %valp2
+  %ins = insertelement <vscale x 8 x half> undef, half %val, i32 0
+  %shf = shufflevector <vscale x 8 x half> %ins, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
+  ret <vscale x 8 x half> %shf
+}
+
+define <vscale x 4 x half> @ld1rh_half_unpacked4(half* %valp) {
+; CHECK-LABEL: ld1rh_half_unpacked4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rh { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load half, half* %valp
+  %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
+  %shf = shufflevector <vscale x 4 x half> %ins, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x half> %shf
+}
+
+define <vscale x 4 x half> @ld1rh_half_unpacked4_gep(half* %valp) {
+; CHECK-LABEL: ld1rh_half_unpacked4_gep:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rh { z0.s }, p0/z, [x0, #126]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr half, half* %valp, i32 63
+  %val = load half, half* %valp2
+  %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
+  %shf = shufflevector <vscale x 4 x half> %ins, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x half> %shf
+}
+
+define <vscale x 4 x half> @ld1rh_half_unpacked4_gep_out_of_range_up(half* %valp) {
+; CHECK-LABEL: ld1rh_half_unpacked4_gep_out_of_range_up:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #128 // =128
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rh { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr half, half* %valp, i32 64
+  %val = load half, half* %valp2
+  %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
+  %shf = shufflevector <vscale x 4 x half> %ins, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x half> %shf
+}
+
+define <vscale x 4 x half> @ld1rh_half_unpacked4_gep_out_of_range_down(half* %valp) {
+; CHECK-LABEL: ld1rh_half_unpacked4_gep_out_of_range_down:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, #2 // =2
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rh { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr half, half* %valp, i32 -1
+  %val = load half, half* %valp2
+  %ins = insertelement <vscale x 4 x half> undef, half %val, i32 0
+  %shf = shufflevector <vscale x 4 x half> %ins, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x half> %shf
+}
+
+define <vscale x 2 x half> @ld1rh_half_unpacked2(half* %valp) {
+; CHECK-LABEL: ld1rh_half_unpacked2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rh { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load half, half* %valp
+  %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
+  %shf = shufflevector <vscale x 2 x half> %ins, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x half> %shf
+}
+
+define <vscale x 2 x half> @ld1rh_half_unpacked2_gep(half* %valp) {
+; CHECK-LABEL: ld1rh_half_unpacked2_gep:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rh { z0.d }, p0/z, [x0, #126]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr half, half* %valp, i32 63
+  %val = load half, half* %valp2
+  %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
+  %shf = shufflevector <vscale x 2 x half> %ins, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x half> %shf
+}
+
+define <vscale x 2 x half> @ld1rh_half_unpacked2_gep_out_of_range_up(half* %valp) {
+; CHECK-LABEL: ld1rh_half_unpacked2_gep_out_of_range_up:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #128 // =128
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rh { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr half, half* %valp, i32 64
+  %val = load half, half* %valp2
+  %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
+  %shf = shufflevector <vscale x 2 x half> %ins, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x half> %shf
+}
+
+define <vscale x 2 x half> @ld1rh_half_unpacked2_gep_out_of_range_down(half* %valp) {
+; CHECK-LABEL: ld1rh_half_unpacked2_gep_out_of_range_down:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, #2 // =2
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rh { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr half, half* %valp, i32 -1
+  %val = load half, half* %valp2
+  %ins = insertelement <vscale x 2 x half> undef, half %val, i32 0
+  %shf = shufflevector <vscale x 2 x half> %ins, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x half> %shf
+}
+
+define <vscale x 4 x float> @ld1rw_float(float* %valp) {
+; CHECK-LABEL: ld1rw_float:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load float, float* %valp
+  %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
+  %shf = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x float> %shf
+}
+
+define <vscale x 4 x float> @ld1rw_float_gep(float* %valp) {
+; CHECK-LABEL: ld1rw_float_gep:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x0, #252]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr float, float* %valp, i32 63
+  %val = load float, float* %valp2
+  %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
+  %shf = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x float> %shf
+}
+
+; Element offset 64 (256 bytes) is expected not to fold: the address is formed
+; with a separate add into x8 and the ld1rw uses no immediate.
+define <vscale x 4 x float> @ld1rw_float_gep_out_of_range_up(float* %valp) {
+; CHECK-LABEL: ld1rw_float_gep_out_of_range_up:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #256 // =256
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr float, float* %valp, i32 64
+  %val = load float, float* %valp2
+  %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
+  %shf = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x float> %shf
+}
+
+; Element offset -1 (-4 bytes) is expected not to fold: the address is formed
+; with a separate sub into x8 and the ld1rw uses no immediate.
+define <vscale x 4 x float> @ld1rw_float_gep_out_of_range_down(float* %valp) {
+; CHECK-LABEL: ld1rw_float_gep_out_of_range_down:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, #4 // =4
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld1rw { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr float, float* %valp, i32 -1
+  %val = load float, float* %valp2
+  %ins = insertelement <vscale x 4 x float> undef, float %val, i32 0
+  %shf = shufflevector <vscale x 4 x float> %ins, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x float> %shf
+}
+
+; <vscale x 2 x float> variant with no offset: expects ptrue p0.d and an ld1rw
+; that writes .d elements, loading from [x0].
+define <vscale x 2 x float> @ld1rw_float_unpacked2(float* %valp) {
+; CHECK-LABEL: ld1rw_float_unpacked2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rw { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load float, float* %valp
+  %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
+  %shf = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x float> %shf
+}
+
+; <vscale x 2 x float> variant: element offset 63 (252 bytes) is expected to
+; fold into the immediate of the ld1rw that writes .d elements.
+define <vscale x 2 x float> @ld1rw_float_unpacked2_gep(float* %valp) {
+; CHECK-LABEL: ld1rw_float_unpacked2_gep:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rw { z0.d }, p0/z, [x0, #252]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr float, float* %valp, i32 63
+  %val = load float, float* %valp2
+  %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
+  %shf = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x float> %shf
+}
+
+; <vscale x 2 x float> variant: element offset 64 (256 bytes) is expected not
+; to fold; the address is formed with a separate add into x8.
+define <vscale x 2 x float> @ld1rw_float_unpacked2_gep_out_of_range_up(float* %valp) {
+; CHECK-LABEL: ld1rw_float_unpacked2_gep_out_of_range_up:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #256 // =256
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rw { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr float, float* %valp, i32 64
+  %val = load float, float* %valp2
+  %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
+  %shf = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x float> %shf
+}
+
+; <vscale x 2 x float> variant: element offset -1 (-4 bytes) is expected not
+; to fold; the address is formed with a separate sub into x8.
+define <vscale x 2 x float> @ld1rw_float_unpacked2_gep_out_of_range_down(float* %valp) {
+; CHECK-LABEL: ld1rw_float_unpacked2_gep_out_of_range_down:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, #4 // =4
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rw { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr float, float* %valp, i32 -1
+  %val = load float, float* %valp2
+  %ins = insertelement <vscale x 2 x float> undef, float %val, i32 0
+  %shf = shufflevector <vscale x 2 x float> %ins, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x float> %shf
+}
+
+; Splat of a double loaded straight from %valp: expects ptrue p0.d followed by
+; a single ld1rd from [x0] with no offset.
+define <vscale x 2 x double> @ld1rd_double(double* %valp) {
+; CHECK-LABEL: ld1rd_double:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %val = load double, double* %valp
+  %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
+  %shf = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x double> %shf
+}
+
+; Element offset 63 (504 bytes) is expected to fold into the ld1rd immediate,
+; so no separate address computation appears.
+define <vscale x 2 x double> @ld1rd_double_gep(double* %valp) {
+; CHECK-LABEL: ld1rd_double_gep:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x0, #504]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr double, double* %valp, i32 63
+  %val = load double, double* %valp2
+  %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
+  %shf = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x double> %shf
+}
+
+; Element offset 64 (512 bytes) is expected not to fold: the address is formed
+; with a separate add into x8 and the ld1rd uses no immediate.
+define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_up(double* %valp) {
+; CHECK-LABEL: ld1rd_double_gep_out_of_range_up:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    add x8, x0, #512 // =512
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr double, double* %valp, i32 64
+  %val = load double, double* %valp2
+  %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
+  %shf = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x double> %shf
+}
+
+; Element offset -1 (-8 bytes) is expected not to fold: the address is formed
+; with a separate sub into x8 and the ld1rd uses no immediate.
+define <vscale x 2 x double> @ld1rd_double_gep_out_of_range_down(double* %valp) {
+; CHECK-LABEL: ld1rd_double_gep_out_of_range_down:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sub x8, x0, #8 // =8
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %valp2 = getelementptr double, double* %valp, i32 -1
+  %val = load double, double* %valp2
+  %ins = insertelement <vscale x 2 x double> undef, double %val, i32 0
+  %shf = shufflevector <vscale x 2 x double> %ins, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x double> %shf
+}
--- a/test/CodeGen/AArch64/sve-ld1r.mir
+++ b/test/CodeGen/AArch64/sve-ld1r.mir
@ -0,0 +1,217 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -run-pass=prologepilog -simplify-mir -verify-machineinstrs %s -o - | FileCheck %s
+#
+# Test that prologepilog rewrites the stack-object operand of every LD1R instruction: an in-range immediate stays on $sp, an out-of-range one goes through a scratch register.
+#
+--- |
+  define void @testcase_positive_offset() {
+    %dummy = alloca i64, align 8
+    %object = alloca i64, align 8
+    ; The MIR body reads from %object at immediate 63, i.e. offset 63 * readsize (in range).
+    ret void
+  }
+  define void @testcase_positive_offset_out_of_range() {
+    %dummy = alloca i64, align 8
+    %object = alloca i64, align 8
+    ; The MIR body reads from %object at immediate 64, i.e. offset 64 * readsize (out of range).
+    ret void
+  }
+  define void @testcase_negative_offset_out_of_range() {
+    %dummy = alloca i64, align 8
+    %object = alloca i64, align 8
+    ; The MIR body reads from %object at immediate -1, i.e. offset -1 * readsize (out of range).
+    ret void
+  }
+...
+---
+# Immediate 63 is in range for every LD1R variant: each %stack.1.object operand
+# is expected to become $sp with the immediate left unchanged.
+name:            testcase_positive_offset
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: dummy, type: default, offset: 0, size: 8, alignment: 8 }
+  - { id: 1, name: object, type: default, offset: 0, size: 8, alignment: 8 }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0
+
+    ; CHECK-LABEL: name: testcase_positive_offset
+    ; CHECK: liveins: $p0
+    ; CHECK: $sp = frame-setup SUBXri $sp, 16, 0
+    ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset <mcsymbol >16
+    ; CHECK: renamable $z0 = LD1RB_IMM renamable $p0, $sp, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: renamable $z0 = LD1RB_H_IMM renamable $p0, $sp, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: renamable $z0 = LD1RB_S_IMM renamable $p0, $sp, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: renamable $z0 = LD1RB_D_IMM renamable $p0, $sp, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: renamable $z0 = LD1RSB_H_IMM renamable $p0, $sp, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: renamable $z0 = LD1RSB_S_IMM renamable $p0, $sp, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: renamable $z0 = LD1RSB_D_IMM renamable $p0, $sp, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: renamable $z0 = LD1RH_IMM renamable $p0, $sp, 63 :: (load (s16) from %ir.object)
+    ; CHECK: renamable $z0 = LD1RH_S_IMM renamable $p0, $sp, 63 :: (load (s16) from %ir.object)
+    ; CHECK: renamable $z0 = LD1RH_D_IMM renamable $p0, $sp, 63 :: (load (s16) from %ir.object)
+    ; CHECK: renamable $z0 = LD1RSH_S_IMM renamable $p0, $sp, 63 :: (load (s16) from %ir.object)
+    ; CHECK: renamable $z0 = LD1RSH_D_IMM renamable $p0, $sp, 63 :: (load (s16) from %ir.object)
+    ; CHECK: renamable $z0 = LD1RW_IMM renamable $p0, $sp, 63 :: (load (s32) from %ir.object)
+    ; CHECK: renamable $z0 = LD1RW_D_IMM renamable $p0, $sp, 63 :: (load (s32) from %ir.object)
+    ; CHECK: renamable $z0 = LD1RSW_IMM renamable $p0, $sp, 63 :: (load (s32) from %ir.object)
+    ; CHECK: renamable $z0 = LD1RD_IMM renamable $p0, $sp, 63 :: (load (s64) from %ir.object)
+    ; CHECK: renamable $z0 = LD1RD_IMM renamable $p0, $sp, 63 :: (load (s64) from %ir.object)
+    ; CHECK: $sp = frame-destroy ADDXri $sp, 16, 0
+    ; CHECK: RET_ReallyLR implicit $z0
+    renamable $z0 = LD1RB_IMM renamable $p0, %stack.1.object, 63 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RB_H_IMM renamable $p0, %stack.1.object, 63 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RB_S_IMM renamable $p0, %stack.1.object, 63 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RB_D_IMM renamable $p0, %stack.1.object, 63 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RSB_H_IMM renamable $p0, %stack.1.object, 63 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RSB_S_IMM renamable $p0, %stack.1.object, 63 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RSB_D_IMM renamable $p0, %stack.1.object, 63 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RH_IMM renamable $p0, %stack.1.object, 63 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RH_S_IMM renamable $p0, %stack.1.object, 63 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RH_D_IMM renamable $p0, %stack.1.object, 63 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RSH_S_IMM renamable $p0, %stack.1.object, 63 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RSH_D_IMM renamable $p0, %stack.1.object, 63 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RW_IMM renamable $p0, %stack.1.object, 63 :: (load 4 from %ir.object, align 4)
+    renamable $z0 = LD1RW_D_IMM renamable $p0, %stack.1.object, 63 :: (load 4 from %ir.object, align 4)
+    renamable $z0 = LD1RSW_IMM renamable $p0, %stack.1.object, 63 :: (load 4 from %ir.object, align 4)
+    renamable $z0 = LD1RD_IMM renamable $p0, %stack.1.object, 63 :: (load 8 from %ir.object, align 8)
+    renamable $z0 = LD1RD_IMM renamable $p0, %stack.1.object, 63 :: (load 8 from %ir.object, align 8)
+    RET_ReallyLR implicit $z0
+...
+---
+# Immediate 64 is out of range: each load is expected to take its base from a
+# scratch register ($x8 = $sp + one access size) and to use immediate 63.
+name:            testcase_positive_offset_out_of_range
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: dummy, type: default, offset: 0, size: 8, alignment: 8 }
+  - { id: 1, name: object, type: default, offset: 0, size: 8, alignment: 8 }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0
+
+    ; CHECK-LABEL: name: testcase_positive_offset_out_of_range
+    ; CHECK: liveins: $p0
+    ; CHECK: $sp = frame-setup SUBXri $sp, 16, 0
+    ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset <mcsymbol >16
+    ; CHECK: $x8 = ADDXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RB_IMM renamable $p0, killed $x8, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = ADDXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RB_H_IMM renamable $p0, killed $x8, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = ADDXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RB_S_IMM renamable $p0, killed $x8, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = ADDXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RB_D_IMM renamable $p0, killed $x8, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = ADDXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RSB_H_IMM renamable $p0, killed $x8, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = ADDXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RSB_S_IMM renamable $p0, killed $x8, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = ADDXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RSB_D_IMM renamable $p0, killed $x8, 63 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = ADDXri $sp, 2, 0
+    ; CHECK: renamable $z0 = LD1RH_IMM renamable $p0, killed $x8, 63 :: (load (s16) from %ir.object)
+    ; CHECK: $x8 = ADDXri $sp, 2, 0
+    ; CHECK: renamable $z0 = LD1RH_S_IMM renamable $p0, killed $x8, 63 :: (load (s16) from %ir.object)
+    ; CHECK: $x8 = ADDXri $sp, 2, 0
+    ; CHECK: renamable $z0 = LD1RH_D_IMM renamable $p0, killed $x8, 63 :: (load (s16) from %ir.object)
+    ; CHECK: $x8 = ADDXri $sp, 2, 0
+    ; CHECK: renamable $z0 = LD1RSH_S_IMM renamable $p0, killed $x8, 63 :: (load (s16) from %ir.object)
+    ; CHECK: $x8 = ADDXri $sp, 2, 0
+    ; CHECK: renamable $z0 = LD1RSH_D_IMM renamable $p0, killed $x8, 63 :: (load (s16) from %ir.object)
+    ; CHECK: $x8 = ADDXri $sp, 4, 0
+    ; CHECK: renamable $z0 = LD1RW_IMM renamable $p0, killed $x8, 63 :: (load (s32) from %ir.object)
+    ; CHECK: $x8 = ADDXri $sp, 4, 0
+    ; CHECK: renamable $z0 = LD1RW_D_IMM renamable $p0, killed $x8, 63 :: (load (s32) from %ir.object)
+    ; CHECK: $x8 = ADDXri $sp, 4, 0
+    ; CHECK: renamable $z0 = LD1RSW_IMM renamable $p0, killed $x8, 63 :: (load (s32) from %ir.object)
+    ; CHECK: $x8 = ADDXri $sp, 8, 0
+    ; CHECK: renamable $z0 = LD1RD_IMM renamable $p0, killed $x8, 63 :: (load (s64) from %ir.object)
+    ; CHECK: $x8 = ADDXri $sp, 8, 0
+    ; CHECK: renamable $z0 = LD1RD_IMM renamable $p0, killed $x8, 63 :: (load (s64) from %ir.object)
+    ; CHECK: $sp = frame-destroy ADDXri $sp, 16, 0
+    ; CHECK: RET_ReallyLR implicit $z0
+    renamable $z0 = LD1RB_IMM renamable $p0, %stack.1.object, 64 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RB_H_IMM renamable $p0, %stack.1.object, 64 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RB_S_IMM renamable $p0, %stack.1.object, 64 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RB_D_IMM renamable $p0, %stack.1.object, 64 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RSB_H_IMM renamable $p0, %stack.1.object, 64 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RSB_S_IMM renamable $p0, %stack.1.object, 64 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RSB_D_IMM renamable $p0, %stack.1.object, 64 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RH_IMM renamable $p0, %stack.1.object, 64 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RH_S_IMM renamable $p0, %stack.1.object, 64 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RH_D_IMM renamable $p0, %stack.1.object, 64 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RSH_S_IMM renamable $p0, %stack.1.object, 64 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RSH_D_IMM renamable $p0, %stack.1.object, 64 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RW_IMM renamable $p0, %stack.1.object, 64 :: (load 4 from %ir.object, align 4)
+    renamable $z0 = LD1RW_D_IMM renamable $p0, %stack.1.object, 64 :: (load 4 from %ir.object, align 4)
+    renamable $z0 = LD1RSW_IMM renamable $p0, %stack.1.object, 64 :: (load 4 from %ir.object, align 4)
+    renamable $z0 = LD1RD_IMM renamable $p0, %stack.1.object, 64 :: (load 8 from %ir.object, align 8)
+    renamable $z0 = LD1RD_IMM renamable $p0, %stack.1.object, 64 :: (load 8 from %ir.object, align 8)
+    RET_ReallyLR implicit $z0
+...
+
+...
+---
+# Immediate -1 is out of range: each load is expected to take its base from a
+# scratch register ($x8 = $sp - one access size) and to use immediate 0.
+name:            testcase_negative_offset_out_of_range
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: dummy, type: default, offset: 0, size: 8, alignment: 8 }
+  - { id: 1, name: object, type: default, offset: 0, size: 8, alignment: 8 }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: $p0
+
+    ; CHECK-LABEL: name: testcase_negative_offset_out_of_range
+    ; CHECK: liveins: $p0
+    ; CHECK: $sp = frame-setup SUBXri $sp, 16, 0
+    ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset <mcsymbol >16
+    ; CHECK: $x8 = SUBXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RB_IMM renamable $p0, killed $x8, 0 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = SUBXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RB_H_IMM renamable $p0, killed $x8, 0 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = SUBXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RB_S_IMM renamable $p0, killed $x8, 0 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = SUBXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RB_D_IMM renamable $p0, killed $x8, 0 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = SUBXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RSB_H_IMM renamable $p0, killed $x8, 0 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = SUBXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RSB_S_IMM renamable $p0, killed $x8, 0 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = SUBXri $sp, 1, 0
+    ; CHECK: renamable $z0 = LD1RSB_D_IMM renamable $p0, killed $x8, 0 :: (load (s8) from %ir.object, align 2)
+    ; CHECK: $x8 = SUBXri $sp, 2, 0
+    ; CHECK: renamable $z0 = LD1RH_IMM renamable $p0, killed $x8, 0 :: (load (s16) from %ir.object)
+    ; CHECK: $x8 = SUBXri $sp, 2, 0
+    ; CHECK: renamable $z0 = LD1RH_S_IMM renamable $p0, killed $x8, 0 :: (load (s16) from %ir.object)
+    ; CHECK: $x8 = SUBXri $sp, 2, 0
+    ; CHECK: renamable $z0 = LD1RH_D_IMM renamable $p0, killed $x8, 0 :: (load (s16) from %ir.object)
+    ; CHECK: $x8 = SUBXri $sp, 2, 0
+    ; CHECK: renamable $z0 = LD1RSH_S_IMM renamable $p0, killed $x8, 0 :: (load (s16) from %ir.object)
+    ; CHECK: $x8 = SUBXri $sp, 2, 0
+    ; CHECK: renamable $z0 = LD1RSH_D_IMM renamable $p0, killed $x8, 0 :: (load (s16) from %ir.object)
+    ; CHECK: $x8 = SUBXri $sp, 4, 0
+    ; CHECK: renamable $z0 = LD1RW_IMM renamable $p0, killed $x8, 0 :: (load (s32) from %ir.object)
+    ; CHECK: $x8 = SUBXri $sp, 4, 0
+    ; CHECK: renamable $z0 = LD1RW_D_IMM renamable $p0, killed $x8, 0 :: (load (s32) from %ir.object)
+    ; CHECK: $x8 = SUBXri $sp, 4, 0
+    ; CHECK: renamable $z0 = LD1RSW_IMM renamable $p0, killed $x8, 0 :: (load (s32) from %ir.object)
+    ; CHECK: $x8 = SUBXri $sp, 8, 0
+    ; CHECK: renamable $z0 = LD1RD_IMM renamable $p0, killed $x8, 0 :: (load (s64) from %ir.object)
+    ; CHECK: $x8 = SUBXri $sp, 8, 0
+    ; CHECK: renamable $z0 = LD1RD_IMM renamable $p0, killed $x8, 0 :: (load (s64) from %ir.object)
+    ; CHECK: $sp = frame-destroy ADDXri $sp, 16, 0
+    ; CHECK: RET_ReallyLR implicit $z0
+    renamable $z0 = LD1RB_IMM renamable $p0, %stack.1.object, -1 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RB_H_IMM renamable $p0, %stack.1.object, -1 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RB_S_IMM renamable $p0, %stack.1.object, -1 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RB_D_IMM renamable $p0, %stack.1.object, -1 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RSB_H_IMM renamable $p0, %stack.1.object, -1 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RSB_S_IMM renamable $p0, %stack.1.object, -1 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RSB_D_IMM renamable $p0, %stack.1.object, -1 :: (load 1 from %ir.object, align 2)
+    renamable $z0 = LD1RH_IMM renamable $p0, %stack.1.object, -1 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RH_S_IMM renamable $p0, %stack.1.object, -1 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RH_D_IMM renamable $p0, %stack.1.object, -1 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RSH_S_IMM renamable $p0, %stack.1.object, -1 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RSH_D_IMM renamable $p0, %stack.1.object, -1 :: (load 2 from %ir.object, align 2)
+    renamable $z0 = LD1RW_IMM renamable $p0, %stack.1.object, -1 :: (load 4 from %ir.object, align 4)
+    renamable $z0 = LD1RW_D_IMM renamable $p0, %stack.1.object, -1 :: (load 4 from %ir.object, align 4)
+    renamable $z0 = LD1RSW_IMM renamable $p0, %stack.1.object, -1 :: (load 4 from %ir.object, align 4)
+    renamable $z0 = LD1RD_IMM renamable $p0, %stack.1.object, -1 :: (load 8 from %ir.object, align 8)
+    renamable $z0 = LD1RD_IMM renamable $p0, %stack.1.object, -1 :: (load 8 from %ir.object, align 8)
+    RET_ReallyLR implicit $z0
+...
--- a/test/CodeGen/AArch64/sve-vector-splat.ll
+++ b/test/CodeGen/AArch64/sve-vector-splat.ll
@ -1,74 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s

 ;; Splats of legal integer vector types

 define <vscale x 16 x i8> @sve_splat_16xi8(i8 %val) {
-; CHECK-LABEL: @sve_splat_16xi8
-; CHECK: mov z0.b, w0
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_16xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.b, w0
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 16 x i8> undef, i8 %val, i32 0
  %splat = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
  ret <vscale x 16 x i8> %splat
 }

 define <vscale x 8 x i16> @sve_splat_8xi16(i16 %val) {
-; CHECK-LABEL: @sve_splat_8xi16
-; CHECK: mov z0.h, w0
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_8xi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, w0
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 8 x i16> undef, i16 %val, i32 0
  %splat = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x i16> %splat
 }

 define <vscale x 4 x i32> @sve_splat_4xi32(i32 %val) {
-; CHECK-LABEL: @sve_splat_4xi32
-; CHECK: mov z0.s, w0
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_4xi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, w0
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 4 x i32> undef, i32 %val, i32 0
  %splat = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i32> %splat
 }

 define <vscale x 2 x i64> @sve_splat_2xi64(i64 %val) {
-; CHECK-LABEL: @sve_splat_2xi64
-; CHECK: mov z0.d, x0
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_2xi64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, x0
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 2 x i64> undef, i64 %val, i32 0
  %splat = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i64> %splat
 }

 define <vscale x 16 x i8> @sve_splat_16xi8_imm() {
-; CHECK-LABEL: @sve_splat_16xi8_imm
-; CHECK: mov z0.b, #1
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_16xi8_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.b, #1 // =0x1
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 16 x i8> undef, i8 1, i32 0
  %splat = shufflevector <vscale x 16 x i8> %ins, <vscale x 16 x i8> undef, <vscale x 16 x i32> zeroinitializer
  ret <vscale x 16 x i8> %splat
 }

 define <vscale x 8 x i16> @sve_splat_8xi16_imm() {
-; CHECK-LABEL: @sve_splat_8xi16_imm
-; CHECK: mov z0.h, #1
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_8xi16_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #1 // =0x1
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 8 x i16> undef, i16 1, i32 0
  %splat = shufflevector <vscale x 8 x i16> %ins, <vscale x 8 x i16> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x i16> %splat
 }

 define <vscale x 4 x i32> @sve_splat_4xi32_imm() {
-; CHECK-LABEL: @sve_splat_4xi32_imm
-; CHECK: mov z0.s, #1
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_4xi32_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #1 // =0x1
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 4 x i32> undef, i32 1, i32 0
  %splat = shufflevector <vscale x 4 x i32> %ins, <vscale x 4 x i32> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i32> %splat
 }

 define <vscale x 2 x i64> @sve_splat_2xi64_imm() {
-; CHECK-LABEL: @sve_splat_2xi64_imm
-; CHECK: mov z0.d, #1
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_2xi64_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #1 // =0x1
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 2 x i64> undef, i64 1, i32 0
  %splat = shufflevector <vscale x 2 x i64> %ins, <vscale x 2 x i64> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i64> %splat
@ -77,54 +86,63 @@ define <vscale x 2 x i64> @sve_splat_2xi64_imm() {
 ;; Promote splats of smaller illegal integer vector types

 define <vscale x 2 x i8> @sve_splat_2xi8(i8 %val) {
-; CHECK-LABEL: @sve_splat_2xi8
-; CHECK: mov z0.d, x0
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_2xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    mov z0.d, x0
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 2 x i8> undef, i8 %val, i32 0
  %splat = shufflevector <vscale x 2 x i8> %ins, <vscale x 2 x i8> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i8> %splat
 }

 define <vscale x 4 x i8> @sve_splat_4xi8(i8 %val) {
-; CHECK-LABEL: @sve_splat_4xi8
-; CHECK: mov z0.s, w0
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_4xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, w0
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 4 x i8> undef, i8 %val, i32 0
  %splat = shufflevector <vscale x 4 x i8> %ins, <vscale x 4 x i8> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i8> %splat
 }

 define <vscale x 8 x i8> @sve_splat_8xi8(i8 %val) {
-; CHECK-LABEL: @sve_splat_8xi8
-; CHECK: mov z0.h, w0
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_8xi8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, w0
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 8 x i8> undef, i8 %val, i32 0
  %splat = shufflevector <vscale x 8 x i8> %ins, <vscale x 8 x i8> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x i8> %splat
 }

 define <vscale x 2 x i16> @sve_splat_2xi16(i16 %val) {
-; CHECK-LABEL: @sve_splat_2xi16
-; CHECK: mov z0.d, x0
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_2xi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    mov z0.d, x0
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 2 x i16> undef, i16 %val, i32 0
  %splat = shufflevector <vscale x 2 x i16> %ins, <vscale x 2 x i16> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i16> %splat
 }

 define <vscale x 4 x i16> @sve_splat_4xi16(i16 %val) {
-; CHECK-LABEL: @sve_splat_4xi16
-; CHECK: mov z0.s, w0
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_4xi16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, w0
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 4 x i16> undef, i16 %val, i32 0
  %splat = shufflevector <vscale x 4 x i16> %ins, <vscale x 4 x i16> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i16> %splat
 }

 define <vscale x 2 x i32> @sve_splat_2xi32(i32 %val) {
-; CHECK-LABEL: @sve_splat_2xi32
-; CHECK: mov z0.d, x0
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_2xi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    mov z0.d, x0
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 2 x i32> undef, i32 %val, i32 0
  %splat = shufflevector <vscale x 2 x i32> %ins, <vscale x 2 x i32> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i32> %splat
@ -134,8 +152,9 @@ define <vscale x 2 x i32> @sve_splat_2xi32(i32 %val) {

 define <vscale x 1 x i32> @sve_splat_1xi32(i32 %val) {
 ; CHECK-LABEL: sve_splat_1xi32:
-; CHECK:       mov z0.s, w0
-; CHECK-NEXT:  ret
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    mov z0.s, w0
+; CHECK-NEXT:    ret
 entry:
  %ins = insertelement <vscale x 1 x i32> undef, i32 %val, i32 0
  %splat = shufflevector <vscale x 1 x i32> %ins, <vscale x 1 x i32> undef, <vscale x 1 x i32> zeroinitializer
@ -143,51 +162,60 @@ entry:
 }

 define <vscale x 12 x i32> @sve_splat_12xi32(i32 %val) {
-; CHECK-LABEL: @sve_splat_12xi32
-; CHECK: mov z0.s, w0
-; CHECK-NEXT: mov z1.d, z0.d
-; CHECK-NEXT: mov z2.d, z0.d
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_12xi32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, w0
+; CHECK-NEXT:    mov z1.d, z0.d
+; CHECK-NEXT:    mov z2.d, z0.d
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 12 x i32> undef, i32 %val, i32 0
  %splat = shufflevector <vscale x 12 x i32> %ins, <vscale x 12 x i32> undef, <vscale x 12 x i32> zeroinitializer
  ret <vscale x 12 x i32> %splat
 }

 define <vscale x 2 x i1> @sve_splat_2xi1(i1 %val) {
-; CHECK-LABEL: @sve_splat_2xi1
-; CHECK: sbfx x8, x0, #0, #1
-; CHECK-NEXT: whilelo p0.d, xzr, x8
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_2xi1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sbfx x8, x0, #0, #1
+; CHECK-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 2 x i1> undef, i1 %val, i32 0
  %splat = shufflevector <vscale x 2 x i1> %ins, <vscale x 2 x i1> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x i1> %splat
 }

 define <vscale x 4 x i1> @sve_splat_4xi1(i1 %val) {
-; CHECK-LABEL: @sve_splat_4xi1
-; CHECK: sbfx x8, x0, #0, #1
-; CHECK-NEXT: whilelo p0.s, xzr, x8
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_4xi1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sbfx x8, x0, #0, #1
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 4 x i1> undef, i1 %val, i32 0
  %splat = shufflevector <vscale x 4 x i1> %ins, <vscale x 4 x i1> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x i1> %splat
 }

 define <vscale x 8 x i1> @sve_splat_8xi1(i1 %val) {
-; CHECK-LABEL: @sve_splat_8xi1
-; CHECK: sbfx x8, x0, #0, #1
-; CHECK-NEXT: whilelo p0.h, xzr, x8
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_8xi1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sbfx x8, x0, #0, #1
+; CHECK-NEXT:    whilelo p0.h, xzr, x8
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 8 x i1> undef, i1 %val, i32 0
  %splat = shufflevector <vscale x 8 x i1> %ins, <vscale x 8 x i1> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x i1> %splat
 }

 define <vscale x 16 x i1> @sve_splat_16xi1(i1 %val) {
-; CHECK-LABEL: @sve_splat_16xi1
-; CHECK: sbfx x8, x0, #0, #1
-; CHECK-NEXT: whilelo p0.b, xzr, x8
-; CHECK-NEXT: ret
+; CHECK-LABEL: sve_splat_16xi1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $w0 killed $w0 def $x0
+; CHECK-NEXT:    sbfx x8, x0, #0, #1
+; CHECK-NEXT:    whilelo p0.b, xzr, x8
+; CHECK-NEXT:    ret
  %ins = insertelement <vscale x 16 x i1> undef, i1 %val, i32 0
  %splat = shufflevector <vscale x 16 x i1> %ins, <vscale x 16 x i1> undef, <vscale x 16 x i32> zeroinitializer
  ret <vscale x 16 x i1> %splat
@ -197,8 +225,10 @@ define <vscale x 16 x i1> @sve_splat_16xi1(i1 %val) {

 define <vscale x 8 x bfloat> @splat_nxv8bf16(bfloat %val) #0 {
 ; CHECK-LABEL: splat_nxv8bf16:
-; CHECK: mov z0.h, h0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 8 x bfloat> undef, bfloat %val, i32 0
  %2 = shufflevector <vscale x 8 x bfloat> %1, <vscale x 8 x bfloat> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x bfloat> %2
@ -206,8 +236,10 @@ define <vscale x 8 x bfloat> @splat_nxv8bf16(bfloat %val) #0 {

 define <vscale x 8 x half> @splat_nxv8f16(half %val) {
 ; CHECK-LABEL: splat_nxv8f16:
-; CHECK: mov z0.h, h0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 8 x half> undef, half %val, i32 0
  %2 = shufflevector <vscale x 8 x half> %1, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x half> %2
@ -215,8 +247,10 @@ define <vscale x 8 x half> @splat_nxv8f16(half %val) {

 define <vscale x 4 x half> @splat_nxv4f16(half %val) {
 ; CHECK-LABEL: splat_nxv4f16:
-; CHECK: mov z0.h, h0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 4 x half> undef, half %val, i32 0
  %2 = shufflevector <vscale x 4 x half> %1, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x half> %2
@ -224,8 +258,10 @@ define <vscale x 4 x half> @splat_nxv4f16(half %val) {

 define <vscale x 2 x half> @splat_nxv2f16(half %val) {
 ; CHECK-LABEL: splat_nxv2f16:
-; CHECK: mov z0.h, h0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $h0 killed $h0 def $z0
+; CHECK-NEXT:    mov z0.h, h0
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 2 x half> undef, half %val, i32 0
  %2 = shufflevector <vscale x 2 x half> %1, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x half> %2
@ -233,8 +269,10 @@ define <vscale x 2 x half> @splat_nxv2f16(half %val) {

 define <vscale x 4 x float> @splat_nxv4f32(float %val) {
 ; CHECK-LABEL: splat_nxv4f32:
-; CHECK: mov z0.s, s0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 4 x float> undef, float %val, i32 0
  %2 = shufflevector <vscale x 4 x float> %1, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x float> %2
@ -242,8 +280,10 @@ define <vscale x 4 x float> @splat_nxv4f32(float %val) {

 define <vscale x 2 x float> @splat_nxv2f32(float %val) {
 ; CHECK-LABEL: splat_nxv2f32:
-; CHECK: mov z0.s, s0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $z0
+; CHECK-NEXT:    mov z0.s, s0
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 2 x float> undef, float %val, i32 0
  %2 = shufflevector <vscale x 2 x float> %1, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x float> %2
@ -251,8 +291,10 @@ define <vscale x 2 x float> @splat_nxv2f32(float %val) {

 define <vscale x 2 x double> @splat_nxv2f64(double %val) {
 ; CHECK-LABEL: splat_nxv2f64:
-; CHECK: mov z0.d, d0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
+; CHECK-NEXT:    mov z0.d, d0
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 2 x double> undef, double %val, i32 0
  %2 = shufflevector <vscale x 2 x double> %1, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x double> %2
@ -260,57 +302,65 @@ define <vscale x 2 x double> @splat_nxv2f64(double %val) {

 ; Returning a zeroinitializer <vscale x 8 x half> is expected to compile to a
 ; single `mov z0.h, #0` followed by `ret`.
 define <vscale x 8 x half> @splat_nxv8f16_zero() {
 ; CHECK-LABEL: splat_nxv8f16_zero:
-; CHECK: mov z0.h, #0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    ret
  ret <vscale x 8 x half> zeroinitializer
 }

 ; bfloat variant of the zero splat; it uses attribute group #0, which enables
 ; +bf16 in addition to +sve. The expected output is a single `mov z0.h, #0`.
 define <vscale x 8 x bfloat> @splat_nxv8bf16_zero() #0 {
 ; CHECK-LABEL: splat_nxv8bf16_zero:
-; CHECK: mov z0.h, #0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    ret
  ret <vscale x 8 x bfloat> zeroinitializer
 }

 ; Zero splat of <vscale x 4 x half>; the expected output is the same
 ; `mov z0.h, #0` as for the <vscale x 8 x half> case.
 define <vscale x 4 x half> @splat_nxv4f16_zero() {
 ; CHECK-LABEL: splat_nxv4f16_zero:
-; CHECK: mov z0.h, #0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    ret
  ret <vscale x 4 x half> zeroinitializer
 }

 ; Zero splat of <vscale x 2 x half>; the expected output is again a single
 ; `mov z0.h, #0`.
 define <vscale x 2 x half> @splat_nxv2f16_zero() {
 ; CHECK-LABEL: splat_nxv2f16_zero:
-; CHECK: mov z0.h, #0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    ret
  ret <vscale x 2 x half> zeroinitializer
 }

 ; Zero splat of <vscale x 4 x float>; expected to be emitted as a single
 ; `mov z0.s, #0`.
 define <vscale x 4 x float> @splat_nxv4f32_zero() {
 ; CHECK-LABEL: splat_nxv4f32_zero:
-; CHECK: mov z0.s, #0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    ret
  ret <vscale x 4 x float> zeroinitializer
 }

 ; Zero splat of <vscale x 2 x float>; the expected output is the same
 ; `mov z0.s, #0` as for the <vscale x 4 x float> case.
 define <vscale x 2 x float> @splat_nxv2f32_zero() {
 ; CHECK-LABEL: splat_nxv2f32_zero:
-; CHECK: mov z0.s, #0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    ret
  ret <vscale x 2 x float> zeroinitializer
 }

 ; Zero splat of <vscale x 2 x double>; expected to be emitted as a single
 ; `mov z0.d, #0`.
 define <vscale x 2 x double> @splat_nxv2f64_zero() {
 ; CHECK-LABEL: splat_nxv2f64_zero:
-; CHECK: mov z0.d, #0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.d, #0 // =0x0
+; CHECK-NEXT:    ret
  ret <vscale x 2 x double> zeroinitializer
 }

 define <vscale x 8 x half> @splat_nxv8f16_imm() {
 ; CHECK-LABEL: splat_nxv8f16_imm:
-; CHECK: fmov z0.h, #1.00000000
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z0.h, #1.00000000
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 8 x half> undef, half 1.0, i32 0
  %2 = shufflevector <vscale x 8 x half> %1, <vscale x 8 x half> undef, <vscale x 8 x i32> zeroinitializer
  ret <vscale x 8 x half> %2
@ -318,8 +368,9 @@ define <vscale x 8 x half> @splat_nxv8f16_imm() {

 define <vscale x 4 x half> @splat_nxv4f16_imm() {
 ; CHECK-LABEL: splat_nxv4f16_imm:
-; CHECK: fmov z0.h, #1.00000000
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z0.h, #1.00000000
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 4 x half> undef, half 1.0, i32 0
  %2 = shufflevector <vscale x 4 x half> %1, <vscale x 4 x half> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x half> %2
@ -327,8 +378,9 @@ define <vscale x 4 x half> @splat_nxv4f16_imm() {

 define <vscale x 2 x half> @splat_nxv2f16_imm() {
 ; CHECK-LABEL: splat_nxv2f16_imm:
-; CHECK: fmov z0.h, #1.00000000
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z0.h, #1.00000000
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 2 x half> undef, half 1.0, i32 0
  %2 = shufflevector <vscale x 2 x half> %1, <vscale x 2 x half> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x half> %2
@ -336,8 +388,9 @@ define <vscale x 2 x half> @splat_nxv2f16_imm() {

 define <vscale x 4 x float> @splat_nxv4f32_imm() {
 ; CHECK-LABEL: splat_nxv4f32_imm:
-; CHECK: fmov z0.s, #1.00000000
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z0.s, #1.00000000
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 4 x float> undef, float 1.0, i32 0
  %2 = shufflevector <vscale x 4 x float> %1, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x float> %2
@ -345,8 +398,9 @@ define <vscale x 4 x float> @splat_nxv4f32_imm() {

 define <vscale x 2 x float> @splat_nxv2f32_imm() {
 ; CHECK-LABEL: splat_nxv2f32_imm:
-; CHECK: fmov z0.s, #1.00000000
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z0.s, #1.00000000
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 2 x float> undef, float 1.0, i32 0
  %2 = shufflevector <vscale x 2 x float> %1, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x float> %2
@ -354,8 +408,9 @@ define <vscale x 2 x float> @splat_nxv2f32_imm() {

 define <vscale x 2 x double> @splat_nxv2f64_imm() {
 ; CHECK-LABEL: splat_nxv2f64_imm:
-; CHECK: fmov z0.d, #1.00000000
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmov z0.d, #1.00000000
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 2 x double> undef, double 1.0, i32 0
  %2 = shufflevector <vscale x 2 x double> %1, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x double> %2
@ -363,8 +418,9 @@ define <vscale x 2 x double> @splat_nxv2f64_imm() {

 ; `sub %x, %x` on <vscale x 4 x i32> is expected to fold to a zero splat, so
 ; the only instruction before `ret` is `mov z0.s, #0`.
 define <vscale x 4 x i32> @splat_nxv4i32_fold(<vscale x 4 x i32> %x) {
 ; CHECK-LABEL: splat_nxv4i32_fold:
-; CHECK: mov z0.s, #0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    ret
  %r = sub <vscale x 4 x i32> %x, %x
  ret <vscale x 4 x i32> %r
 }
@ -372,38 +428,84 @@ define <vscale x 4 x i32> @splat_nxv4i32_fold(<vscale x 4 x i32> %x) {

 ; Floating-point counterpart of the integer fold test: `fsub nnan %x, %x` is
 ; expected to fold to a zero splat (`mov z0.s, #0`).
 ; NOTE(review): the fold presumably relies on the nnan flag, since x - x is
 ; not zero for NaN inputs -- confirm against the LangRef fast-math rules.
 define <vscale x 4 x float> @splat_nxv4f32_fold(<vscale x 4 x float> %x) {
 ; CHECK-LABEL: splat_nxv4f32_fold:
-; CHECK: mov z0.s, #0
-; CHECK-NEXT: ret
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.s, #0 // =0x0
+; CHECK-NEXT:    ret
  %r = fsub nnan <vscale x 4 x float> %x, %x
  ret <vscale x 4 x float> %r
 }

 ; Splat of float 42.0 into <vscale x 2 x float>. The expected output moves the
 ; constant's bit pattern (1109917696 = 0x42280000) into w8 with one `mov` and
 ; then broadcasts it from the general-purpose register with `mov z0.s, w8`.
 define <vscale x 2 x float> @splat_nxv2f32_fmov_fold() {
-; CHECK-LABEL: splat_nxv2f32_fmov_fold
-; CHECK: mov w8, #1109917696
-; CHECK-NEXT: mov z0.s, w8
+; CHECK-LABEL: splat_nxv2f32_fmov_fold:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #1109917696
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 2 x float> undef, float 4.200000e+01, i32 0
  %2 = shufflevector <vscale x 2 x float> %1, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x float> %2
 }

 ; Splat of float 42.0 into <vscale x 4 x float>. The expected output is the
 ; same as for the <vscale x 2 x float> case: the bit pattern 1109917696
 ; (0x42280000) goes into w8 and is broadcast with `mov z0.s, w8`.
 define <vscale x 4 x float> @splat_nxv4f32_fmov_fold() {
-; CHECK-LABEL: splat_nxv4f32_fmov_fold
-; CHECK: mov w8, #1109917696
-; CHECK-NEXT: mov z0.s, w8
+; CHECK-LABEL: splat_nxv4f32_fmov_fold:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #1109917696
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 4 x float> undef, float 4.200000e+01, i32 0
  %2 = shufflevector <vscale x 4 x float> %1, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
  ret <vscale x 4 x float> %2
 }

 ; Splat of double 42.0 into <vscale x 2 x double>. The expected output moves
 ; the 64-bit pattern (4631107791820423168 = 0x4045000000000000) into x8 with a
 ; single `mov` and broadcasts it with `mov z0.d, x8`.
 define <vscale x 2 x double> @splat_nxv2f64_fmov_fold() {
-; CHECK-LABEL: splat_nxv2f64_fmov_fold
-; CHECK: mov x8, #4631107791820423168
-; CHECK-NEXT: mov z0.d, x8
+; CHECK-LABEL: splat_nxv2f64_fmov_fold:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x8, #4631107791820423168
+; CHECK-NEXT:    mov z0.d, x8
+; CHECK-NEXT:    ret
  %1 = insertelement <vscale x 2 x double> undef, double 4.200000e+01, i32 0
  %2 = shufflevector <vscale x 2 x double> %1, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
  ret <vscale x 2 x double> %2
 }

+; Splat of floating-point constants not representable as a single immediate.
+
+; Splat of float 3.33 (written as its exact value 3.3299999237060546875) into
+; <vscale x 2 x float>. The bit pattern 0x40551EB8 is expected to be assembled
+; in w8 from two halves -- `mov` of 7864 (0x1EB8) and `movk` of 16469 (0x4055)
+; shifted left by 16 -- and then broadcast with `mov z0.s, w8`.
+define <vscale x 2 x float> @splat_nxv2f32_imm_out_of_range() {
+; CHECK-LABEL: splat_nxv2f32_imm_out_of_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #7864
+; CHECK-NEXT:    movk w8, #16469, lsl #16
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    ret
+  %1 = insertelement <vscale x 2 x float> undef, float 3.3299999237060546875, i32 0
+  %2 = shufflevector <vscale x 2 x float> %1, <vscale x 2 x float> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x float> %2
+}
+
+; Same constant as the <vscale x 2 x float> case, splatted into
+; <vscale x 4 x float>. The expected output is identical: `mov` + `movk` build
+; 0x40551EB8 in w8, then `mov z0.s, w8` broadcasts it.
+define <vscale x 4 x float> @splat_nxv4f32_imm_out_of_range() {
+; CHECK-LABEL: splat_nxv4f32_imm_out_of_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w8, #7864
+; CHECK-NEXT:    movk w8, #16469, lsl #16
+; CHECK-NEXT:    mov z0.s, w8
+; CHECK-NEXT:    ret
+  %1 = insertelement <vscale x 4 x float> undef, float 3.3299999237060546875, i32 0
+  %2 = shufflevector <vscale x 4 x float> %1, <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer
+  ret <vscale x 4 x float> %2
+}
+
+; Splat of double 3.33 into <vscale x 2 x double>. Unlike the float cases, the
+; expected output reads the constant from the constant pool (.LCPI47_0):
+; `adrp` + `add` form its address and `ld1rd`, predicated on `ptrue p0.d`,
+; loads and replicates it into z0.d. This exercises the splat-from-memory
+; lowering rather than a general-purpose-register `mov`.
+define <vscale x 2 x double> @splat_nxv2f64_imm_out_of_range() {
+; CHECK-LABEL: splat_nxv2f64_imm_out_of_range:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    adrp x8, .LCPI47_0
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    add x8, x8, :lo12:.LCPI47_0
+; CHECK-NEXT:    ld1rd { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    ret
+  %1 = insertelement <vscale x 2 x double> undef, double 3.33, i32 0
+  %2 = shufflevector <vscale x 2 x double> %1, <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer
+  ret <vscale x 2 x double> %2
+}
+
 ; +bf16 is required for the bfloat version.
 attributes #0 = { "target-features"="+sve,+bf16" }