
[IR] Introduce llvm.experimental.vector.splice intrinsic

This patch introduces a new intrinsic @llvm.experimental.vector.splice
that constructs a vector of the same type as the two input vectors,
based on an immediate, where the sign of the immediate distinguishes two
variants. A positive immediate specifies an index into the first vector
and a negative immediate specifies the number of trailing elements to
extract from the first vector.

For example:

  @llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, 1) ==> <B, C, D, E>  ; index
  @llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, -3) ==> <B, C, D, E> ; trailing element count

The intrinsic supports both fixed and scalable vectors. For fixed-width
vectors it is lowered to a shufflevector to maintain existing behaviour;
while the intrinsic is marked as experimental, the recommended way to
express this operation for fixed-width vectors is still to use
shufflevector. For scalable vectors, where a shufflevector mask cannot
express this operation, a new ISD node has been implemented.
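
For illustration, the element selection performed by both variants can be
modelled with a few lines of C++ (a sketch only, not part of the patch; the
helper name is made up):

  // Reference model: both variants select a contiguous window of VL
  // elements out of concat(vec1, vec2).
  #include <cassert>
  #include <cstdint>
  #include <vector>

  template <typename T>
  std::vector<T> spliceModel(const std::vector<T> &Vec1,
                             const std::vector<T> &Vec2, int64_t Imm) {
    int64_t VL = static_cast<int64_t>(Vec1.size());
    assert(Vec2.size() == Vec1.size() && Imm >= -VL && Imm < VL);
    // A positive Imm is the start index into Vec1; a negative Imm keeps the
    // -Imm trailing elements of Vec1, i.e. starts at VL + Imm. Both cases
    // reduce to Imm modulo VL.
    int64_t Start = ((Imm % VL) + VL) % VL;
    std::vector<T> Concat(Vec1);
    Concat.insert(Concat.end(), Vec2.begin(), Vec2.end());
    return std::vector<T>(Concat.begin() + Start, Concat.begin() + Start + VL);
  }

With the vectors from the example above, both Imm = 1 and Imm = -3 give
Start = 1 and hence <B, C, D, E>.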

This is one of the named shufflevector intrinsics proposed on the
mailing list in the RFC at [1].

Patch by Paul Walker and Cullen Rhodes.

[1] https://lists.llvm.org/pipermail/llvm-dev/2020-November/146864.html

Reviewed By: sdesmalen

Differential Revision: https://reviews.llvm.org/D94708
Cullen Rhodes 2021-01-08 14:06:13 +00:00
parent ee05374523
commit 6682076a17
17 changed files with 1691 additions and 3 deletions

View File

@@ -16510,6 +16510,52 @@ Arguments:
The argument to this intrinsic must be a vector.

'``llvm.experimental.vector.splice``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""
This is an overloaded intrinsic.

::

      declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %vec1, <2 x double> %vec2, i32 %imm)
      declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, i32 %imm)

Overview:
"""""""""

The '``llvm.experimental.vector.splice.*``' intrinsics construct a vector by
concatenating elements from the first input vector with elements of the second
input vector, returning a vector of the same type as the input vectors. The
signed immediate, modulo the number of elements in the vector, is the index
into the first vector from which to extract the result value. This means
conceptually that for a positive immediate, a vector is extracted from
``concat(%vec1, %vec2)`` starting at index ``imm``, whereas for a negative
immediate, it extracts ``-imm`` trailing elements from the first vector, and
the remaining elements from ``%vec2``.

These intrinsics work for both fixed and scalable vectors. While this intrinsic
is marked as experimental, the recommended way to express this operation for
fixed-width vectors is still to use a shufflevector, as that may allow for more
optimization opportunities.

For example:

.. code-block:: text

 llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, 1)  ==> <B, C, D, E>  ; index
 llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, -3) ==> <B, C, D, E>  ; trailing elements

Arguments:
""""""""""

The first two operands are vectors with the same type. The third argument
``imm`` is the start index, modulo VL, where VL is the runtime vector length of
the source/result vector. The ``imm`` is a signed integer constant in the range
``-VL <= imm < VL``. For values outside of this range the result is poison.

Matrix Intrinsics
-----------------

View File

@@ -556,6 +556,18 @@ enum NodeType {
/// in terms of the element size of VEC1/VEC2, not in terms of bytes.
VECTOR_SHUFFLE,
/// VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as
/// VEC1/VEC2 from CONCAT_VECTORS(VEC1, VEC2), based on the IMM in two ways.
/// Let the result type be T, if IMM is positive it represents the starting
/// element number (an index) from which a subvector of type T is extracted
/// from CONCAT_VECTORS(VEC1, VEC2). If IMM is negative it represents a count
/// specifying the number of trailing elements to extract from VEC1, where the
/// elements of T are selected using the following algorithm:
/// RESULT[i] = CONCAT_VECTORS(VEC1,VEC2)[VEC1.ElementCount - ABS(IMM) + i]
/// If IMM is not in the range [-VL, VL-1] the result vector is undefined. IMM
/// is a constant integer.
VECTOR_SPLICE,
/// SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a
/// scalar value into element 0 of the resultant vector type. The top
/// elements 1 to N-1 of the N-element vector are undefined. The type

View File

@@ -4511,6 +4511,10 @@ public:
/// Returns true if the expansion was successful.
bool expandREM(SDNode *Node, SDValue &Result, SelectionDAG &DAG) const;
/// Method for building the DAG expansion of ISD::VECTOR_SPLICE. This
/// method accepts vectors as its arguments.
SDValue expandVectorSplice(SDNode *Node, SelectionDAG &DAG) const;
//===--------------------------------------------------------------------===//
// Instruction Emitting Hooks
//

View File

@@ -1659,6 +1659,13 @@ def int_experimental_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[llvm_anyvector_ty, llvm_i64_ty],
[IntrNoMem, ImmArg<ArgIndex<1>>]>;
//===---------- Named shufflevector intrinsics ------===//
def int_experimental_vector_splice : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>,
LLVMMatchType<0>,
llvm_i32_ty],
[IntrNoMem, ImmArg<ArgIndex<2>>]>;
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
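
As a usage sketch (not part of this patch), a pass could create a call to the
new intrinsic through the generic intrinsic declaration machinery; the helper
below and its name are assumptions for illustration only:

  #include "llvm/IR/Constants.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"

  // Hypothetical helper: emit a call to @llvm.experimental.vector.splice for
  // two vector values of the same type. The intrinsic is overloaded on the
  // vector type only, so one type argument resolves the mangled name.
  static llvm::Value *emitVectorSplice(llvm::IRBuilder<> &Builder,
                                       llvm::Value *V1, llvm::Value *V2,
                                       int64_t Imm) {
    llvm::Module *M = Builder.GetInsertBlock()->getModule();
    llvm::Function *Splice = llvm::Intrinsic::getDeclaration(
        M, llvm::Intrinsic::experimental_vector_splice, {V1->getType()});
    llvm::Value *ImmV =
        llvm::ConstantInt::get(Builder.getInt32Ty(), Imm, /*IsSigned=*/true);
    return Builder.CreateCall(Splice, {V1, V2, ImmV});
  }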

View File

@@ -241,6 +241,9 @@ def SDTMaskedLoad: SDTypeProfile<1, 4, [ // masked load
def SDTVecShuffle : SDTypeProfile<1, 2, [
SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
]>;
def SDTVecSlice : SDTypeProfile<1, 3, [ // vector splice
SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisInt<3>
]>;
def SDTVecExtract : SDTypeProfile<1, 2, [ // vector extract
SDTCisEltOfVec<0, 1>, SDTCisPtrTy<2>
]>;
@@ -655,6 +658,7 @@ def ist : SDNode<"ISD::STORE" , SDTIStore,
def vector_shuffle : SDNode<"ISD::VECTOR_SHUFFLE", SDTVecShuffle, []>;
def vector_reverse : SDNode<"ISD::VECTOR_REVERSE", SDTVecReverse>;
def vector_splice : SDNode<"ISD::VECTOR_SPLICE", SDTVecSlice, []>;
def build_vector : SDNode<"ISD::BUILD_VECTOR", SDTypeProfile<1, -1, []>, []>;
def splat_vector : SDNode<"ISD::SPLAT_VECTOR", SDTypeProfile<1, 1, []>, []>;
def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>,

View File

@@ -3208,6 +3208,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Results.push_back(Tmp1);
break;
}
case ISD::VECTOR_SPLICE: {
Results.push_back(TLI.expandVectorSplice(Node, DAG));
break;
}
case ISD::EXTRACT_ELEMENT: {
EVT OpTy = Node->getOperand(0).getValueType();
if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
@@ -4715,7 +4719,14 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
Results.push_back(Tmp1);
break;
}
case ISD::VECTOR_SPLICE: {
Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0));
Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(1));
Tmp3 = DAG.getNode(ISD::VECTOR_SPLICE, dl, NVT, Tmp1, Tmp2,
Node->getOperand(2));
Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp3));
break;
}
case ISD::SELECT_CC: {
SDValue Cond = Node->getOperand(4);
ISD::CondCode CCCode = cast<CondCodeSDNode>(Cond)->get();
@@ -4753,7 +4764,6 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
Results.push_back(Tmp1);
break;
}
case ISD::SETCC:
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: {

View File

@@ -100,6 +100,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
Res = PromoteIntRes_VECTOR_REVERSE(N); break;
case ISD::VECTOR_SHUFFLE:
Res = PromoteIntRes_VECTOR_SHUFFLE(N); break;
case ISD::VECTOR_SPLICE:
Res = PromoteIntRes_VECTOR_SPLICE(N); break;
case ISD::INSERT_VECTOR_ELT:
Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break;
case ISD::BUILD_VECTOR:
@@ -4616,6 +4618,15 @@ SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) {
return Swap.getValue(1);
}
SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SPLICE(SDNode *N) {
SDLoc dl(N);
SDValue V0 = GetPromotedInteger(N->getOperand(0));
SDValue V1 = GetPromotedInteger(N->getOperand(1));
EVT OutVT = V0.getValueType();
return DAG.getNode(ISD::VECTOR_SPLICE, dl, OutVT, V0, V1, N->getOperand(2));
}
SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {

View File

@@ -300,6 +300,7 @@ private:
SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
SDValue PromoteIntRes_VECTOR_REVERSE(SDNode *N);
SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
SDValue PromoteIntRes_VECTOR_SPLICE(SDNode *N);
SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N);
SDValue PromoteIntRes_SPLAT_VECTOR(SDNode *N);
@@ -838,6 +839,7 @@ private:
void SplitVecRes_VECTOR_REVERSE(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
SDValue &Hi);
void SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo, SDValue &Hi);

View File

@@ -947,6 +947,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::VECTOR_SHUFFLE:
SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
break;
case ISD::VECTOR_SPLICE:
SplitVecRes_VECTOR_SPLICE(N, Lo, Hi);
break;
case ISD::VAARG:
SplitVecRes_VAARG(N, Lo, Hi);
break;
@@ -1257,7 +1260,7 @@ void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
Hi = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec,
DAG.getVectorIdxConstant(IdxVal + LoVT.getVectorNumElements(), dl));
DAG.getVectorIdxConstant(IdxVal + LoVT.getVectorMinNumElements(), dl));
}
void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
@@ -5519,3 +5522,19 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_REVERSE(SDNode *N, SDValue &Lo,
Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, InHi.getValueType(), InHi);
Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, InLo.getValueType(), InLo);
}
void DAGTypeLegalizer::SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo,
SDValue &Hi) {
EVT VT = N->getValueType(0);
SDLoc DL(N);
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
SDValue Expanded = TLI.expandVectorSplice(N, DAG);
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, Expanded,
DAG.getVectorIdxConstant(0, DL));
Hi =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, Expanded,
DAG.getVectorIdxConstant(LoVT.getVectorMinNumElements(), DL));
}

View File

@@ -7105,6 +7105,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::experimental_vector_reverse:
visitVectorReverse(I);
return;
case Intrinsic::experimental_vector_splice:
visitVectorSplice(I);
return;
}
}
@@ -10956,3 +10959,37 @@ void SelectionDAGBuilder::visitFreeze(const FreezeInst &I) {
setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
DAG.getVTList(ValueVTs), Values));
}
void SelectionDAGBuilder::visitVectorSplice(const CallInst &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
SDLoc DL = getCurSDLoc();
SDValue V1 = getValue(I.getOperand(0));
SDValue V2 = getValue(I.getOperand(1));
int64_t Imm = cast<ConstantInt>(I.getOperand(2))->getSExtValue();
// VECTOR_SHUFFLE doesn't support a scalable mask so use a dedicated node.
if (VT.isScalableVector()) {
MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
setValue(&I, DAG.getNode(ISD::VECTOR_SPLICE, DL, VT, V1, V2,
DAG.getConstant(Imm, DL, IdxVT)));
return;
}
unsigned NumElts = VT.getVectorNumElements();
if ((-Imm > NumElts) || (Imm >= NumElts)) {
// Result is undefined if immediate is out-of-bounds.
setValue(&I, DAG.getUNDEF(VT));
return;
}
uint64_t Idx = (NumElts + Imm) % NumElts;
// Use VECTOR_SHUFFLE to maintain original behaviour for fixed-length vectors.
SmallVector<int, 8> Mask;
for (unsigned i = 0; i < NumElts; ++i)
Mask.push_back(Idx + i);
setValue(&I, DAG.getVectorShuffle(VT, DL, V1, V2, Mask));
}
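
The fixed-width path above simply selects a rotated window of concat(V1, V2)
via the shuffle mask; the following standalone sketch (purely illustrative,
not part of the patch) reproduces the mask computation for the examples from
the LangRef change:

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Mirrors the mask construction above: entries NumElts..2*NumElts-1 select
  // from the second vector, exactly as VECTOR_SHUFFLE expects.
  static std::vector<int> spliceMask(unsigned NumElts, int64_t Imm) {
    uint64_t Idx = (NumElts + Imm) % NumElts;
    std::vector<int> Mask;
    for (unsigned i = 0; i < NumElts; ++i)
      Mask.push_back(static_cast<int>(Idx + i));
    return Mask;
  }

  int main() {
    for (int Elt : spliceMask(4, 1))  // 1 2 3 4, i.e. <B, C, D, E>
      std::printf("%d ", Elt);
    std::printf("\n");
    for (int Elt : spliceMask(4, -3)) // 1 2 3 4, the same window
      std::printf("%d ", Elt);
    std::printf("\n");
    return 0;
  }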

View File

@@ -778,6 +778,7 @@ private:
void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
void visitVectorReverse(const CallInst &I);
void visitVectorSplice(const CallInst &I);
void visitUserOp1(const Instruction &I) {
llvm_unreachable("UserOp1 should not exist at instruction selection time!");

View File

@@ -288,6 +288,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::EXTRACT_SUBVECTOR: return "extract_subvector";
case ISD::SCALAR_TO_VECTOR: return "scalar_to_vector";
case ISD::VECTOR_SHUFFLE: return "vector_shuffle";
case ISD::VECTOR_SPLICE: return "vector_splice";
case ISD::SPLAT_VECTOR: return "splat_vector";
case ISD::VECTOR_REVERSE: return "vector_reverse";
case ISD::CARRY_FALSE: return "carry_false";

View File

@@ -8625,3 +8625,76 @@ SDValue TargetLowering::expandFP_TO_INT_SAT(SDNode *Node,
SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
return DAG.getSelectCC(dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
}
SDValue TargetLowering::expandVectorSplice(SDNode *Node,
SelectionDAG &DAG) const {
assert(Node->getOpcode() == ISD::VECTOR_SPLICE && "Unexpected opcode!");
assert(Node->getValueType(0).isScalableVector() &&
"Fixed length vector types expected to use SHUFFLE_VECTOR!");
EVT VT = Node->getValueType(0);
SDValue V1 = Node->getOperand(0);
SDValue V2 = Node->getOperand(1);
int64_t Imm = cast<ConstantSDNode>(Node->getOperand(2))->getSExtValue();
SDLoc DL(Node);
// Expand through memory thusly:
// Alloca CONCAT_VECTORS_TYPES(V1, V2) Ptr
// Store V1, Ptr
// Store V2, Ptr + sizeof(V1)
// If (Imm < 0)
// TrailingElts = -Imm
// Ptr = Ptr + sizeof(V1) - (TrailingElts * sizeof(VT.Elt))
// else
// Ptr = Ptr + (Imm * sizeof(VT.Elt))
// Res = Load Ptr
Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);
EVT MemVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
VT.getVectorElementCount() * 2);
SDValue StackPtr = DAG.CreateStackTemporary(MemVT.getStoreSize(), Alignment);
EVT PtrVT = StackPtr.getValueType();
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
// Store the lo part of CONCAT_VECTORS(V1, V2)
SDValue StoreV1 = DAG.getStore(DAG.getEntryNode(), DL, V1, StackPtr, PtrInfo);
// Store the hi part of CONCAT_VECTORS(V1, V2)
SDValue OffsetToV2 = DAG.getVScale(
DL, PtrVT,
APInt(PtrVT.getFixedSizeInBits(), VT.getStoreSize().getKnownMinSize()));
SDValue StackPtr2 = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, OffsetToV2);
SDValue StoreV2 = DAG.getStore(StoreV1, DL, V2, StackPtr2, PtrInfo);
if (Imm >= 0) {
// Load back the required element. getVectorElementPointer takes care of
// clamping the index if it's out-of-bounds.
StackPtr = getVectorElementPointer(DAG, StackPtr, VT, Node->getOperand(2));
// Load the spliced result
return DAG.getLoad(VT, DL, StoreV2, StackPtr,
MachinePointerInfo::getUnknownStack(MF));
}
uint64_t TrailingElts = -Imm;
// NOTE: TrailingElts must be clamped so as not to read outside of V1:V2.
TypeSize EltByteSize = VT.getVectorElementType().getStoreSize();
SDValue TrailingBytes =
DAG.getConstant(TrailingElts * EltByteSize, DL, PtrVT);
if (TrailingElts > VT.getVectorMinNumElements()) {
SDValue VLBytes = DAG.getVScale(
DL, PtrVT,
APInt(PtrVT.getFixedSizeInBits(), VT.getStoreSize().getKnownMinSize()));
TrailingBytes = DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, VLBytes);
}
// Calculate the start address of the spliced result.
StackPtr2 = DAG.getNode(ISD::SUB, DL, PtrVT, StackPtr2, TrailingBytes);
// Load the spliced result
return DAG.getLoad(VT, DL, StoreV2, StackPtr2,
MachinePointerInfo::getUnknownStack(MF));
}
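
To make the pointer arithmetic in this expansion concrete, here is an
illustrative byte-offset model (a sketch with a made-up vscale value; the real
code only knows the minimum vector size and scales it by vscale at runtime):

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  // Byte offset into the stack slot holding concat(V1, V2) from which the
  // expansion loads the result. VecBytes is the runtime byte size of one
  // source vector (minimum size scaled by vscale), EltBytes the element size.
  static uint64_t spliceLoadOffset(uint64_t VecBytes, uint64_t EltBytes,
                                   int64_t Imm) {
    if (Imm >= 0)
      // Positive index: step Imm elements into V1. (Clamping of oversized
      // indices is handled by getVectorElementPointer in the real code.)
      return static_cast<uint64_t>(Imm) * EltBytes;
    // Negative index: back up -Imm elements from the start of V2, clamped so
    // the load never begins before the start of V1.
    uint64_t TrailingBytes =
        std::min(static_cast<uint64_t>(-Imm) * EltBytes, VecBytes);
    return VecBytes - TrailingBytes;
  }

  int main() {
    // Example: nxv4i32 with vscale == 2, i.e. 8 lanes of 4 bytes.
    assert(spliceLoadOffset(32, 4, 3) == 12);  // start at element 3 of V1
    assert(spliceLoadOffset(32, 4, -3) == 20); // last 3 elements of V1 first
    return 0;
  }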

View File

@@ -849,6 +849,9 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::VECREDUCE_FMIN, VT, Expand);
setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Expand);
setOperationAction(ISD::VECREDUCE_SEQ_FMUL, VT, Expand);
// Named vector shuffles default to expand.
setOperationAction(ISD::VECTOR_SPLICE, VT, Expand);
}
// Most targets ignore the @llvm.prefetch intrinsic.

View File

@@ -1108,6 +1108,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::SDIV, VT, Custom);
setOperationAction(ISD::UDIV, VT, Custom);
setOperationAction(ISD::SMIN, VT, Custom);
@@ -1276,6 +1277,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
}
setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();

View File

@@ -0,0 +1,142 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs < %s | FileCheck %s
target triple = "aarch64-unknown-linux-gnu"
;
; VECTOR_SPLICE (index)
;
define <16 x i8> @splice_v16i8_idx(<16 x i8> %a, <16 x i8> %b) #0 {
; CHECK-LABEL: splice_v16i8_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #1
; CHECK-NEXT: ret
%res = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
ret <16 x i8> %res
}
define <2 x double> @splice_v2f64_idx(<2 x double> %a, <2 x double> %b) #0 {
; CHECK-LABEL: splice_v2f64_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
; CHECK-NEXT: ret
%res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 1)
ret <2 x double> %res
}
; Verify promote type legalisation works as expected.
define <2 x i8> @splice_v2i8_idx(<2 x i8> %a, <2 x i8> %b) #0 {
; CHECK-LABEL: splice_v2i8_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
; CHECK-NEXT: ret
%res = call <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 1)
ret <2 x i8> %res
}
; Verify splitvec type legalisation works as expected.
define <8 x i32> @splice_v8i32_idx(<8 x i32> %a, <8 x i32> %b) #0 {
; CHECK-LABEL: splice_v8i32_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #4
; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #4
; CHECK-NEXT: ret
%res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 5)
ret <8 x i32> %res
}
; Verify splitvec type legalisation works as expected.
define <16 x float> @splice_v16f32_idx(<16 x float> %a, <16 x float> %b) #0 {
; CHECK-LABEL: splice_v16f32_idx:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #12
; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #12
; CHECK-NEXT: ext v2.16b, v3.16b, v4.16b, #12
; CHECK-NEXT: ext v3.16b, v4.16b, v5.16b, #12
; CHECK-NEXT: ret
%res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 7)
ret <16 x float> %res
}
; Verify out-of-bounds index results in undef vector.
define <2 x double> @splice_v2f64_idx_out_of_bounds(<2 x double> %a, <2 x double> %b) #0 {
; CHECK-LABEL: splice_v2f64_idx_out_of_bounds:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
%res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 2)
ret <2 x double> %res
}
;
; VECTOR_SPLICE (trailing elements)
;
define <16 x i8> @splice_v16i8(<16 x i8> %a, <16 x i8> %b) #0 {
; CHECK-LABEL: splice_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #1
; CHECK-NEXT: ret
%res = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 -15)
ret <16 x i8> %res
}
define <2 x double> @splice_v2f64(<2 x double> %a, <2 x double> %b) #0 {
; CHECK-LABEL: splice_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v1.16b, #8
; CHECK-NEXT: ret
%res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -1)
ret <2 x double> %res
}
; Verify promote type legalisation works as expected.
define <2 x i8> @splice_v2i8(<2 x i8> %a, <2 x i8> %b) #0 {
; CHECK-LABEL: splice_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.8b, v0.8b, v1.8b, #4
; CHECK-NEXT: ret
%res = call <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 -1)
ret <2 x i8> %res
}
; Verify splitvec type legalisation works as expected.
define <8 x i32> @splice_v8i32(<8 x i32> %a, <8 x i32> %b) #0 {
; CHECK-LABEL: splice_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #4
; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #4
; CHECK-NEXT: ret
%res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 -3)
ret <8 x i32> %res
}
; Verify splitvec type legalisation works as expected.
define <16 x float> @splice_v16f32(<16 x float> %a, <16 x float> %b) #0 {
; CHECK-LABEL: splice_v16f32:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v1.16b, v2.16b, #12
; CHECK-NEXT: ext v1.16b, v2.16b, v3.16b, #12
; CHECK-NEXT: ext v2.16b, v3.16b, v4.16b, #12
; CHECK-NEXT: ext v3.16b, v4.16b, v5.16b, #12
; CHECK-NEXT: ret
%res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 -9)
ret <16 x float> %res
}
; Verify out-of-bounds trailing element count results in undef vector.
define <2 x double> @splice_v2f64_out_of_bounds(<2 x double> %a, <2 x double> %b) #0 {
; CHECK-LABEL: splice_v2f64_out_of_bounds:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
%res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -3)
ret <2 x double> %res
}
declare <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
declare <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
declare <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
declare <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
attributes #0 = { nounwind "target-features"="+neon" }

File diff suppressed because it is too large.