
[ARM] Rejig some of the MVE gather/scatter lowering pass. NFC

This adjusts how the gather/scatter lowering pass passes data around
and where certain gathers/scatters are created. It should not affect
code generation on its own, but it allows other patches to reason about
the code more clearly.

A number of extra test cases were also added for smaller
gathers/scatters that can be extended, and some of the test comments
were updated.
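
For context, the "incrementing" pattern that tryCreateIncrementingGatScat
looks for is a loop-carried vector of offsets advanced by a constant splat
on every iteration. A minimal illustrative IR sketch (not taken from this
commit; the names are made up) of a gather that can become a writeback QI
gather such as vldrw.u32 q0, [q1, #16]!:

  vector.body:
    %offs = phi <4 x i32> [ %offs.init, %entry ], [ %offs.next, %vector.body ]
    ; gather from base plus per-lane offsets
    %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
    %data = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
    ; constant per-iteration increment the pass can fold into the gather
    %offs.next = add <4 x i32> %offs, <i32 4, i32 4, i32 4, i32 4>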
David Green 2021-06-15 15:38:39 +01:00
parent 30958bdc95
commit e55f90b072
6 changed files with 200 additions and 56 deletions

View File

@@ -84,8 +84,8 @@ private:
   // Check for a getelementptr and deduce base and offsets from it, on success
   // returning the base directly and the offsets indirectly using the Offsets
   // argument
-  Value *checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP,
-                  IRBuilder<> &Builder);
+  Value *decomposeGEP(Value *&Offsets, FixedVectorType *Ty,
+                      GetElementPtrInst *GEP, IRBuilder<> &Builder);
   // Compute the scale of this gather/scatter instruction
   int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
   // If the value is a constant, or derived from constants via additions
@@ -123,8 +123,7 @@ private:
   // QI gathers and scatters can increment their offsets on their own if
   // the increment is a constant value (digit)
-  Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *BasePtr,
-                                      Value *Ptr, GetElementPtrInst *GEP,
+  Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *Ptr,
                                       IRBuilder<> &Builder);
   // QI gathers/scatters can increment their offsets on their own if the
   // increment is a constant value (digit) - this creates a writeback QI
@@ -214,9 +213,10 @@ static bool checkOffsetSize(Value *Offsets, unsigned TargetElemCount) {
   return true;
 }
 
-Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, FixedVectorType *Ty,
-                                          GetElementPtrInst *GEP,
-                                          IRBuilder<> &Builder) {
+Value *MVEGatherScatterLowering::decomposeGEP(Value *&Offsets,
+                                              FixedVectorType *Ty,
+                                              GetElementPtrInst *GEP,
+                                              IRBuilder<> &Builder) {
   if (!GEP) {
     LLVM_DEBUG(dbgs() << "masked gathers/scatters: no getelementpointer "
                       << "found\n");
@@ -372,7 +372,10 @@ Value *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
   Builder.SetCurrentDebugLocation(I->getDebugLoc());
 
   Instruction *Root = I;
-  Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
+
+  Value *Load = tryCreateIncrementingGatScat(I, Ptr, Builder);
+  if (!Load)
+    Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
   if (!Load)
     Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
   if (!Load)
@@ -478,14 +481,9 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
   Value *BasePtr =
-      checkGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder);
+      decomposeGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder);
   if (!BasePtr)
     return nullptr;
-  // Check whether the offset is a constant increment that could be merged into
-  // a QI gather
-  Value *Load = tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
-  if (Load)
-    return Load;
 
   int Scale =
       computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
@@ -533,7 +531,9 @@ Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
   Builder.SetInsertPoint(I);
   Builder.SetCurrentDebugLocation(I->getDebugLoc());
 
-  Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
+  Value *Store = tryCreateIncrementingGatScat(I, Ptr, Builder);
+  if (!Store)
+    Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
   if (!Store)
     Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
   if (!Store)
@@ -598,6 +598,7 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
   Value *Mask = I->getArgOperand(3);
   Type *InputTy = Input->getType();
   Type *MemoryTy = InputTy;
+
   LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
                     << " to base + vector of offsets\n");
   // If the input has been truncated, try to integrate that trunc into the
@@ -619,15 +620,10 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
   Value *BasePtr =
-      checkGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder);
+      decomposeGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder);
   if (!BasePtr)
     return nullptr;
-  // Check whether the offset is a constant increment that could be merged into
-  // a QI gather
-  Value *Store =
-      tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
-  if (Store)
-    return Store;
+
   int Scale =
       computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
                    MemoryTy->getScalarSizeInBits());
@@ -652,21 +648,28 @@ Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
 }
 
 Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
-    IntrinsicInst *I, Value *BasePtr, Value *Offsets, GetElementPtrInst *GEP,
-    IRBuilder<> &Builder) {
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
   FixedVectorType *Ty;
   if (I->getIntrinsicID() == Intrinsic::masked_gather)
     Ty = cast<FixedVectorType>(I->getType());
   else
     Ty = cast<FixedVectorType>(I->getArgOperand(0)->getType());
+
   // Incrementing gathers only exist for v4i32
-  if (Ty->getNumElements() != 4 ||
-      Ty->getScalarSizeInBits() != 32)
+  if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
     return nullptr;
+  // Incrementing gathers are not beneficial outside of a loop
   Loop *L = LI->getLoopFor(I->getParent());
   if (L == nullptr)
-    // Incrementing gathers are not beneficial outside of a loop
     return nullptr;
+
+  // Decompose the GEP into Base and Offsets
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  Value *Offsets;
+  Value *BasePtr = decomposeGEP(Offsets, Ty, GEP, Builder);
+  if (!BasePtr)
+    return nullptr;
+
   LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
                        "wb gather/scatter\n");
@@ -689,6 +692,7 @@ Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
     if (Load != nullptr)
       return Load;
   }
+
   LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
                        "non-wb gather/scatter\n");

View File

@@ -273,6 +273,25 @@ entry:
   ret <4 x i32> %ext
 }
 
+define arm_aapcs_vfpcc <4 x i16> @ptr_v4i16(<4 x i16*>* %offptr) {
+; CHECK-LABEL: ptr_v4i16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
+  %gather = call <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef)
+  ret <4 x i16> %gather
+}
+
 define arm_aapcs_vfpcc <8 x i32> @ptr_v8i16_sext(<8 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -369,6 +388,25 @@ entry:
   ret <8 x half> %gather
 }
 
+define arm_aapcs_vfpcc <4 x half> @ptr_v4f16(<4 x half*>* %offptr) {
+; CHECK-LABEL: ptr_v4f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vldr.16 s8, [r1]
+; CHECK-NEXT:    vldr.16 s0, [r0]
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vins.f16 s0, s8
+; CHECK-NEXT:    vldr.16 s4, [r1]
+; CHECK-NEXT:    vldr.16 s1, [r0]
+; CHECK-NEXT:    vins.f16 s1, s4
+; CHECK-NEXT:    bx lr
+entry:
+  %offs = load <4 x half*>, <4 x half*>* %offptr, align 4
+  %gather = call <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x half> undef)
+  ret <4 x half> %gather
+}
+
 ; i8
 define arm_aapcs_vfpcc <16 x i8> @ptr_i8(<16 x i8*>* %offptr) {
@@ -499,6 +537,40 @@ entry:
   ret <8 x i16> %ext
 }
 
+define arm_aapcs_vfpcc <8 x i8> @ptr_v8i8(<8 x i8*>* %offptr) {
+; CHECK-LABEL: ptr_v8i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r6, lr}
+; CHECK-NEXT:    push {r4, r5, r6, lr}
+; CHECK-NEXT:    vldrw.u32 q0, [r0, #16]
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    vmov r3, r12, d1
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r4, r5, d0
+; CHECK-NEXT:    vmov r0, lr, d1
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r6, [r3]
+; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.16 q0[0], r4
+; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    vmov.16 q0[1], r5
+; CHECK-NEXT:    ldrb.w r3, [lr]
+; CHECK-NEXT:    vmov.16 q0[2], r0
+; CHECK-NEXT:    ldrb.w r12, [r12]
+; CHECK-NEXT:    vmov.16 q0[3], r3
+; CHECK-NEXT:    vmov.16 q0[4], r1
+; CHECK-NEXT:    vmov.16 q0[5], r2
+; CHECK-NEXT:    vmov.16 q0[6], r6
+; CHECK-NEXT:    vmov.16 q0[7], r12
+; CHECK-NEXT:    pop {r4, r5, r6, pc}
+entry:
+  %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4
+  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %offs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
+  ret <8 x i8> %gather
+}
+
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_sext32:
 ; CHECK:       @ %bb.0: @ %entry
@@ -543,6 +615,25 @@ entry:
   ret <4 x i32> %ext
 }
 
+define arm_aapcs_vfpcc <4 x i8> @ptr_v4i8(<4 x i8*>* %offptr) {
+; CHECK-LABEL: ptr_v4i8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov r0, r1, d1
+; CHECK-NEXT:    vmov r2, r3, d0
+; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
+; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
+  %gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %offs, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef)
+  ret <4 x i8> %gather
+}
+
 define arm_aapcs_vfpcc <8 x i32> @ptr_v8i8_sext32(<8 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i8_sext32:
 ; CHECK:       @ %bb.0: @ %entry
@@ -623,17 +714,17 @@ define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
-; CHECK-NEXT:  .LBB22_1: @ %vector.body.preheader
+; CHECK-NEXT:  .LBB26_1: @ %vector.body.preheader
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
-; CHECK-NEXT:  .LBB22_2: @ %vector.body
+; CHECK-NEXT:  .LBB26_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vldrwt.u32 q1, [q0]
 ; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
-; CHECK-NEXT:    le lr, .LBB22_2
+; CHECK-NEXT:    le lr, .LBB26_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.end
 ; CHECK-NEXT:    pop {r7, pc}
 entry:
@@ -668,17 +759,17 @@ define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r7, pc}
-; CHECK-NEXT:  .LBB23_1: @ %vector.body.preheader
+; CHECK-NEXT:  .LBB27_1: @ %vector.body.preheader
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    movs r3, #1
 ; CHECK-NEXT:    add.w lr, r3, r2, lsr #2
-; CHECK-NEXT:  .LBB23_2: @ %vector.body
+; CHECK-NEXT:  .LBB27_2: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vldrwt.u32 q1, [q0]
 ; CHECK-NEXT:    vstrwt.32 q1, [r0], #16
-; CHECK-NEXT:    le lr, .LBB23_2
+; CHECK-NEXT:    le lr, .LBB27_2
 ; CHECK-NEXT:  @ %bb.3: @ %for.end
 ; CHECK-NEXT:    pop {r7, pc}
 entry:

View File

@@ -47,7 +47,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offset
 define arm_aapcs_vfpcc void @scaled_v8i16_sext(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: scaled_v8i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -88,7 +88,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offset
 define arm_aapcs_vfpcc void @scaled_v8f16_sext(i16* %base, <8 x i16>* %offptr, <8 x half> %input) {
 ; CHECK-LABEL: scaled_v8f16_sext:
 ; CHECK:       @ %bb.0: @ %entry
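
The retitled comments record why each of these tests still expands: "sext
offset" marks cases where the i16 offsets are sign-extended before feeding
the GEP, and the MVE base-plus-vector-of-offsets forms treat the offset
lanes as unsigned, so the sign extension cannot simply be folded away. An
illustrative shape modeled on the tests in this file (not verbatim):

  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  ; this sext is what blocks the base + offsets scatter form
  %offs.sext = sext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %offs.sext
  call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)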

View File

@@ -65,7 +65,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @unscaled_v8i16_sext(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: unscaled_v8i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -105,7 +105,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @unscaled_v8f16_sext(i8* %base, <8 x i16>* %offptr, <8 x half> %input) {
 ; CHECK-LABEL: unscaled_v8f16_sext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -139,7 +139,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - i32 offsets
 define arm_aapcs_vfpcc void @unscaled_v8i16_noext(i8* %base, <8 x i32>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: unscaled_v8i16_noext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -178,7 +178,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - i32 offsets
 define arm_aapcs_vfpcc void @unscaled_v8f16_noext(i8* %base, <8 x i32>* %offptr, <8 x half> %input) {
 ; CHECK-LABEL: unscaled_v8f16_noext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -243,7 +243,7 @@ entry:
   ret void
 }
 
-; Expand ?
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i64_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -287,7 +287,6 @@ entry:
   ret void
 }
 
-; Expand ?
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -323,7 +322,7 @@ entry:
   ret void
 }
 
-; Expand ?
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i32_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -360,7 +359,6 @@ entry:
   ret void
 }
 
-; Expand ?
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -391,7 +389,7 @@ entry:
   ret void
 }
 
-; Expand ?
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @trunc_signed_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) {
 ; CHECK-LABEL: trunc_signed_unscaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry

View File

@@ -16,7 +16,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expanded ?
 define arm_aapcs_vfpcc void @unscaled_v8i8_i8(i8* %base, <8 x i8>* %offptr, <8 x i8> %input) {
 ; CHECK-LABEL: unscaled_v8i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -79,7 +79,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @unscaled_v16i8_sext(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_sext:
 ; CHECK:       @ %bb.0: @ %entry
@@ -142,7 +142,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - sext offsets
 define arm_aapcs_vfpcc void @unscaled_v16i8_i16(i8* %base, <16 x i16>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_i16:
 ; CHECK:       @ %bb.0: @ %entry
@@ -205,7 +205,7 @@ entry:
   ret void
 }
 
-; Expand
+; Could be manually scaled offsets
 define arm_aapcs_vfpcc void @unscaled_v16i8_scaled(i32* %base, <16 x i8>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_scaled:
 ; CHECK:       @ %bb.0: @ %entry
@@ -273,7 +273,7 @@ entry:
   ret void
 }
 
-; Expand
+; Expand - large offsets
 define arm_aapcs_vfpcc void @unscaled_v16i8_i8_next(i8* %base, <16 x i32>* %offptr, <16 x i8> %input) {
 ; CHECK-LABEL: unscaled_v16i8_i8_next:
 ; CHECK:       @ %bb.0: @ %entry
@@ -335,7 +335,6 @@ entry:
   ret void
 }
 
-; Expand
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <16 x i8>* %offptr, <16 x i64> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -396,7 +395,6 @@ entry:
   ret void
 }
 
-; Expand
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8:
 ; CHECK:       @ %bb.0: @ %entry
@@ -441,7 +439,6 @@ entry:
   ret void
 }
 
-; Expand
 define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) {
 ; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry

View File

@@ -251,6 +251,24 @@ entry:
   ret void
 }
 
+define arm_aapcs_vfpcc void @ptr_v4i16_dup(i32 %v, <4 x i16*> %offs) {
+; CHECK-LABEL: ptr_v4i16_dup:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r1, r2, d0
+; CHECK-NEXT:    vmov r3, r12, d1
+; CHECK-NEXT:    strh r0, [r1]
+; CHECK-NEXT:    strh r0, [r2]
+; CHECK-NEXT:    strh r0, [r3]
+; CHECK-NEXT:    strh.w r0, [r12]
+; CHECK-NEXT:    bx lr
+entry:
+  %ext = trunc i32 %v to i16
+  %splatinsert = insertelement <4 x i16> poison, i16 %ext, i32 0
+  %splat = shufflevector <4 x i16> %splatinsert, <4 x i16> poison, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %splat, <4 x i16*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
 ; Expand
 define arm_aapcs_vfpcc void @ptr_v8i16_trunc(<8 x i32> %v, <8 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v8i16_trunc:
@@ -314,6 +332,42 @@ entry:
   ret void
 }
 
+define arm_aapcs_vfpcc void @ptr_v4f16(<4 x half> %v, <4 x half*>* %offptr) {
+; CHECK-LABEL: ptr_v4f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vstr.16 s8, [r1]
+; CHECK-NEXT:    vmov r0, r1, d3
+; CHECK-NEXT:    vmovx.f16 s0, s1
+; CHECK-NEXT:    vstr.16 s1, [r0]
+; CHECK-NEXT:    vstr.16 s0, [r1]
+; CHECK-NEXT:    bx lr
+entry:
+  %offs = load <4 x half*>, <4 x half*>* %offptr, align 4
+  call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %v, <4 x half*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
+define arm_aapcs_vfpcc void @ptr_v4f16_dup(half %v, <4 x half*> %offs) {
+; CHECK-LABEL: ptr_v4f16_dup:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov r0, r1, d2
+; CHECK-NEXT:    vmov r2, r3, d3
+; CHECK-NEXT:    vstr.16 s0, [r0]
+; CHECK-NEXT:    vstr.16 s0, [r1]
+; CHECK-NEXT:    vstr.16 s0, [r2]
+; CHECK-NEXT:    vstr.16 s0, [r3]
+; CHECK-NEXT:    bx lr
+entry:
+  %splatinsert = insertelement <4 x half> poison, half %v, i32 0
+  %splat = shufflevector <4 x half> %splatinsert, <4 x half> poison, <4 x i32> zeroinitializer
+  call void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half> %splat, <4 x half*> %offs, i32 2, <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
+  ret void
+}
+
 ; i8
 ; Expand.
@@ -473,14 +527,14 @@ define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
-; CHECK-NEXT:  .LBB16_1: @ %vector.body
+; CHECK-NEXT:  .LBB19_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
 ; CHECK-NEXT:    vstrwt.32 q1, [q0]
-; CHECK-NEXT:    bne .LBB16_1
+; CHECK-NEXT:    bne .LBB19_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.end
 ; CHECK-NEXT:    bx lr
 entry:
@@ -513,14 +567,14 @@ define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
 ; CHECK-NEXT:    cmp r3, #1
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    bxlt lr
-; CHECK-NEXT:  .LBB17_1: @ %vector.body
+; CHECK-NEXT:  .LBB20_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    vldrw.u32 q0, [r1], #16
 ; CHECK-NEXT:    subs r2, #4
 ; CHECK-NEXT:    vptt.i32 ne, q0, zr
 ; CHECK-NEXT:    vldrwt.u32 q1, [r0], #16
 ; CHECK-NEXT:    vstrwt.32 q1, [q0]
-; CHECK-NEXT:    bne .LBB17_1
+; CHECK-NEXT:    bne .LBB20_1
 ; CHECK-NEXT:  @ %bb.2: @ %for.end
 ; CHECK-NEXT:    bx lr
 entry: