mirror of https://github.com/RPCS3/llvm-mirror.git
[ARM] Masked loads and stores
Masked loads and stores fit naturally with MVE, the instructions being easily predicated. This adds lowering for the simple cases of masked loads and stores. It does not yet deal with widening/narrowing or pre/post inc, and so is currently behind an option.

The llvm masked load intrinsic will accept a "passthru" value, dictating the values used for the zero masked lanes. In MVE the instructions write 0 to the zero predicated lanes, so we need to match a passthru that isn't 0 (or undef) with a select instruction to pull in the correct data after the load.

Differential Revision: https://reviews.llvm.org/D67186

llvm-svn: 371932
This commit is contained in:
parent b49e547030
commit e94e59a2ae
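To make the passthru handling described in the commit message concrete, here is a minimal LLVM IR sketch (hypothetical function names, not part of this commit). A masked load whose passthru %p is neither zero nor undef behaves as if it were a zero-passthru load, which is what the MVE instruction provides, followed by a select that restores %p in the masked-off lanes; that select corresponds to the VSELECT inserted by LowerMLOAD in the diff below.

; Hypothetical example: a masked load with a non-zero passthru value %p.
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)

define <4 x i32> @load_with_passthru(<4 x i32>* %addr, <4 x i1> %mask, <4 x i32> %p) {
  %l = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> %p)
  ret <4 x i32> %l
}

; The lowering treats it as a zero-passthru load followed by a select that
; reinstates %p in the disabled lanes:
define <4 x i32> @load_then_select(<4 x i32>* %addr, <4 x i1> %mask, <4 x i32> %p) {
  %l = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer)
  %r = select <4 x i1> %mask, <4 x i32> %l, <4 x i32> %p
  ret <4 x i32> %r
}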
@@ -259,6 +259,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
    setOperationAction(ISD::UMAX, VT, Legal);
    setOperationAction(ISD::ABS, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);

    // No native support for these.
    setOperationAction(ISD::UDIV, VT, Expand);
@@ -304,6 +306,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
    setOperationAction(ISD::BUILD_VECTOR, VT.getVectorElementType(), Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Legal);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::MLOAD, VT, Custom);
    setOperationAction(ISD::MSTORE, VT, Legal);

    // Pre and Post inc are supported on loads and stores
    for (unsigned im = (unsigned)ISD::PRE_INC;
@@ -8848,6 +8852,31 @@ static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG) {
                      ST->getMemOperand());
}

static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  SDValue Mask = N->getMask();
  SDValue PassThru = N->getPassThru();
  SDLoc dl(Op);

  if (ISD::isBuildVectorAllZeros(PassThru.getNode()) ||
      (PassThru->getOpcode() == ARMISD::VMOVIMM &&
       isNullConstant(PassThru->getOperand(0))))
    return Op;

  // MVE Masked loads use zero as the passthru value. Here we convert undef to
  // zero too, and other values are lowered to a select.
  SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
                                DAG.getTargetConstant(0, dl, MVT::i32));
  SDValue NewLoad = DAG.getMaskedLoad(
      VT, dl, N->getChain(), N->getBasePtr(), Mask, ZeroVec, N->getMemoryVT(),
      N->getMemOperand(), N->getExtensionType(), N->isExpandingLoad());
  SDValue Combo = NewLoad;
  if (!PassThru.isUndef())
    Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
  return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}

static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
  if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
    // Acquire/Release load/store is not legal for targets without a dmb or
@@ -9051,6 +9080,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    return LowerPredicateLoad(Op, DAG);
  case ISD::STORE:
    return LowerPredicateStore(Op, DAG);
  case ISD::MLOAD:
    return LowerMLOAD(Op, DAG);
  case ISD::ATOMIC_LOAD:
  case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
  case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
@@ -4892,6 +4892,10 @@ class MVE_vector_store_typed<ValueType Ty, Instruction RegImmInst,
                             PatFrag StoreKind, int shift>
  : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr),
        (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr)>;
class MVE_vector_maskedstore_typed<ValueType Ty, Instruction RegImmInst,
                                   PatFrag StoreKind, int shift>
  : Pat<(StoreKind (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, VCCR:$pred),
        (RegImmInst (Ty MQPR:$val), t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred)>;

multiclass MVE_vector_store<Instruction RegImmInst, PatFrag StoreKind,
                            int shift> {
@@ -4908,6 +4912,10 @@ class MVE_vector_load_typed<ValueType Ty, Instruction RegImmInst,
                            PatFrag LoadKind, int shift>
  : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr)),
        (Ty (RegImmInst t2addrmode_imm7<shift>:$addr))>;
class MVE_vector_maskedload_typed<ValueType Ty, Instruction RegImmInst,
                                  PatFrag LoadKind, int shift>
  : Pat<(Ty (LoadKind t2addrmode_imm7<shift>:$addr, VCCR:$pred, (Ty NEONimmAllZerosV))),
        (Ty (RegImmInst t2addrmode_imm7<shift>:$addr, (i32 1), VCCR:$pred))>;

multiclass MVE_vector_load<Instruction RegImmInst, PatFrag LoadKind,
                           int shift> {
@@ -4953,6 +4961,28 @@ def aligned16_post_store : PatFrag<(ops node:$val, node:$ptr, node:$offset),
  return cast<StoreSDNode>(N)->getAlignment() >= 2;
}]>;

def alignedmaskedload32 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
                                  (masked_ld node:$ptr, node:$pred, node:$passthru), [{
  return cast<MaskedLoadSDNode>(N)->getAlignment() >= 4;
}]>;
def alignedmaskedload16 : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
                                  (masked_ld node:$ptr, node:$pred, node:$passthru), [{
  return cast<MaskedLoadSDNode>(N)->getAlignment() >= 2;
}]>;
def maskedload : PatFrag<(ops node:$ptr, node:$pred, node:$passthru),
                         (masked_ld node:$ptr, node:$pred, node:$passthru)>;

def alignedmaskedstore32 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
                                   (masked_st node:$val, node:$ptr, node:$pred), [{
  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 4;
}]>;
def alignedmaskedstore16 : PatFrag<(ops node:$val, node:$ptr, node:$pred),
                                   (masked_st node:$val, node:$ptr, node:$pred), [{
  return cast<MaskedStoreSDNode>(N)->getAlignment() >= 2;
}]>;
def maskedstore : PatFrag<(ops node:$val, node:$ptr, node:$pred),
                          (masked_st node:$val, node:$ptr, node:$pred)>;

let Predicates = [HasMVEInt, IsLE] in {
  // Stores
  defm : MVE_vector_store<MVE_VSTRBU8, byte_alignedstore, 0>;
@@ -4971,6 +5001,26 @@ let Predicates = [HasMVEInt, IsLE] in {
  defm : MVE_vector_offset_store<MVE_VSTRHU16_post, aligned16_post_store, 1>;
  defm : MVE_vector_offset_store<MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
  defm : MVE_vector_offset_store<MVE_VSTRWU32_post, aligned32_post_store, 2>;

  // Unaligned masked stores (aligned are below)
  def : Pat<(maskedstore (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
  def : Pat<(maskedstore (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
  def : Pat<(maskedstore (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
  def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
            (MVE_VSTRBU8 MQPR:$val, t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;

  // Unaligned masked loads
  def : Pat<(v4i32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4i32 NEONimmAllZerosV))),
            (v4i32 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
  def : Pat<(v4f32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4f32 NEONimmAllZerosV))),
            (v4f32 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
  def : Pat<(v8i16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8i16 NEONimmAllZerosV))),
            (v8i16 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
  def : Pat<(v8f16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8f16 NEONimmAllZerosV))),
            (v8f16 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred))>;
}

let Predicates = [HasMVEInt, IsBE] in {
@@ -5025,8 +5075,41 @@ let Predicates = [HasMVEInt, IsBE] in {
  def : MVE_vector_offset_store_typed<v4i32, MVE_VSTRWU32_post, aligned32_post_store, 2>;
  def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_pre, aligned32_pre_store, 2>;
  def : MVE_vector_offset_store_typed<v4f32, MVE_VSTRWU32_post, aligned32_post_store, 2>;

  // Unaligned masked stores (aligned are below)
  def : Pat<(maskedstore (v4i32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
  def : Pat<(maskedstore (v4f32 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
            (MVE_VSTRBU8 (MVE_VREV32_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
  def : Pat<(maskedstore (v8i16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
  def : Pat<(maskedstore (v8f16 MQPR:$val), t2addrmode_imm7<0>:$addr, VCCR:$pred),
            (MVE_VSTRBU8 (MVE_VREV16_8 MQPR:$val), t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)>;
  // Unaligned masked loads
  def : Pat<(v4i32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4i32 NEONimmAllZerosV))),
            (v4i32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
  def : Pat<(v4f32 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v4f32 NEONimmAllZerosV))),
            (v4f32 (MVE_VREV32_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
  def : Pat<(v8i16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8i16 NEONimmAllZerosV))),
            (v8i16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
  def : Pat<(v8f16 (maskedload t2addrmode_imm7<0>:$addr, VCCR:$pred, (v8f16 NEONimmAllZerosV))),
            (v8f16 (MVE_VREV16_8 (MVE_VLDRBU8 t2addrmode_imm7<0>:$addr, (i32 1), VCCR:$pred)))>;
}

let Predicates = [HasMVEInt] in {
  // Aligned masked store, shared between LE and BE
  def : MVE_vector_maskedstore_typed<v16i8, MVE_VSTRBU8, maskedstore, 0>;
  def : MVE_vector_maskedstore_typed<v8i16, MVE_VSTRHU16, alignedmaskedstore16, 1>;
  def : MVE_vector_maskedstore_typed<v8f16, MVE_VSTRHU16, alignedmaskedstore16, 1>;
  def : MVE_vector_maskedstore_typed<v4i32, MVE_VSTRWU32, alignedmaskedstore32, 2>;
  def : MVE_vector_maskedstore_typed<v4f32, MVE_VSTRWU32, alignedmaskedstore32, 2>;
  // Aligned masked loads
  def : MVE_vector_maskedload_typed<v16i8, MVE_VLDRBU8, maskedload, 0>;
  def : MVE_vector_maskedload_typed<v8i16, MVE_VLDRHU16, alignedmaskedload16, 1>;
  def : MVE_vector_maskedload_typed<v8f16, MVE_VLDRHU16, alignedmaskedload16, 1>;
  def : MVE_vector_maskedload_typed<v4i32, MVE_VLDRWU32, alignedmaskedload32, 2>;
  def : MVE_vector_maskedload_typed<v4f32, MVE_VLDRWU32, alignedmaskedload32, 2>;
}

// Widening/Narrowing Loads/Stores
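As a rough illustration of how the patterns above are intended to fire (a sketch under assumptions, not taken from this commit): a naturally aligned v4i32 masked store satisfies the getAlignment() >= 4 check in alignedmaskedstore32 and should select to a predicated word store via MVE_VSTRWU32, while dropping the alignment leaves only the generic maskedstore PatFrag to match, falling back to the byte-wise MVE_VSTRBU8 patterns (with an extra VREV on big-endian targets).

; Hypothetical IR, assuming +mve and -enable-arm-maskedldst. The align 4
; argument is what lets alignedmaskedstore32 match.
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)

define void @aligned_masked_store(<4 x i32> %v, <4 x i32>* %addr, <4 x i1> %mask) {
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
  ret void
}

; With align 1 instead, only the generic maskedstore PatFrag applies and the
; unaligned MVE_VSTRBU8 patterns above are used.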
@@ -36,6 +36,10 @@ using namespace llvm;

#define DEBUG_TYPE "armtti"

static cl::opt<bool> EnableMaskedLoadStores(
    "enable-arm-maskedldst", cl::Hidden, cl::init(false),
    cl::desc("Enable the generation of masked loads and stores"));

static cl::opt<bool> DisableLowOverheadLoops(
    "disable-arm-loloops", cl::Hidden, cl::init(false),
    cl::desc("Disable the generation of low-overhead loops"));
@@ -487,6 +491,22 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
  return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}

bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy) {
  if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
    return false;

  if (DataTy->isVectorTy()) {
    // We don't yet support narrowing or widening masked loads/stores. Expand
    // them for the moment.
    unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
    if (VecWidth != 128)
      return false;
  }

  unsigned EltWidth = DataTy->getScalarSizeInBits();
  return EltWidth == 32 || EltWidth == 16 || EltWidth == 8;
}

int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
  const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
  assert(MI && "MemcpyInst expected");
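For reference, a hypothetical pair of masked-load declarations (not from the commit) showing what the 128-bit restriction in isLegalMaskedLoad means in practice: only full 128-bit MVE vectors with 8-, 16- or 32-bit elements are reported legal, so narrower vectors are expected to be expanded rather than lowered to MVE predicated loads.

; Reported legal by the hook above: a 128-bit vector of 32-bit elements.
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)

; Not legal: <4 x i16> is only 64 bits wide, so VecWidth != 128 and the hook
; returns false; such a load would be expanded/scalarized instead.
declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32, <4 x i1>, <4 x i16>)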
@@ -152,6 +152,9 @@ public:
    return ST->getMaxInterleaveFactor();
  }

  bool isLegalMaskedLoad(Type *DataTy);
  bool isLegalMaskedStore(Type *DataTy) { return isLegalMaskedLoad(DataTy); }

  int getMemcpyCost(const Instruction *I);

  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
(Three further file diffs suppressed because they are too large.)
test/Transforms/LoopVectorize/ARM/mve-maskedldst.ll (new file, 40 lines)
@@ -0,0 +1,40 @@
; RUN: opt -loop-vectorize -enable-arm-maskedldst < %s -S -o - | FileCheck %s

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1-m.main-none-eabi"

; CHECK-LABEL: test
; CHECK: llvm.masked.store.v4i32.p0v4i32
define void @test(i32* nocapture %A, i32 %n) #0 {
entry:
  %cmp12 = icmp sgt i32 %n, 0
  br i1 %cmp12, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader: ; preds = %entry
  br label %for.body

for.body: ; preds = %for.body.preheader, %for.inc
  %i.013 = phi i32 [ %inc, %for.inc ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.013
  %0 = load i32, i32* %arrayidx, align 4
  %.off = add i32 %0, 9
  %1 = icmp ult i32 %.off, 19
  br i1 %1, label %if.then, label %for.inc

if.then: ; preds = %for.body
  store i32 0, i32* %arrayidx, align 4
  br label %for.inc

for.inc: ; preds = %for.body, %if.then
  %inc = add nuw nsw i32 %i.013, 1
  %exitcond = icmp eq i32 %inc, %n
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body

for.cond.cleanup.loopexit: ; preds = %for.inc
  br label %for.cond.cleanup

for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
  ret void
}

attributes #0 = { "target-features"="+mve" }