[IR] Introduce llvm.experimental.vector.splice intrinsic

This patch introduces a new intrinsic @llvm.experimental.vector.splice that constructs a vector of the same type as the two input vectors, based on an immediate where the sign of the immediate distinguishes two variants. A positive immediate specifies an index into the first vector and a negative immediate specifies the number of trailing elements to extract from the first vector. For example: @llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, 1) ==> <B, C, D, E> ; index @llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, -3) ==> <B, C, D, E> ; trailing element count These intrinsics support both fixed and scalable vectors, where the former is lowered to a shufflevector to maintain existing behaviour, although while marked as experimental the recommended way to express this operation for fixed-width vectors is to use shufflevector. For scalable vectors where it is not possible to express a shufflevector mask for this operation, a new ISD node has been implemented. This is one of the named shufflevector intrinsics proposed on the mailing-list in the RFC at [1]. Patch by Paul Walker and Cullen Rhodes. [1] https://lists.llvm.org/pipermail/llvm-dev/2020-November/146864.html Reviewed By: sdesmalen Differential Revision: https://reviews.llvm.org/D94708
2024-11-22 10:42:39 +01:00 · 2021-01-08 14:06:13 +00:00 · 2021-01-08 14:06:13 +00:00 · 6682076a17
commit 6682076a17
parent ee05374523
17 changed files with 1691 additions and 3 deletions
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@ -16510,6 +16510,52 @@ Arguments:
 The argument to this intrinsic must be a vector.
 '``llvm.experimental.vector.splice``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 Syntax:
 """""""
 This is an overloaded intrinsic.
 ::
      declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %vec1, <2 x double> %vec2, i32 %imm)
      declare <vscale x 4 x i32> @llvm.experimental.vector.splice.nxv4i32(<vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, i32 %imm)
 Overview:
 """""""""
 The '``llvm.experimental.vector.splice.*``' intrinsics construct a vector by
 concatenating elements from the first input vector with elements of the second
 input vector, returning a vector of the same type as the input vectors. The
 signed immediate, modulo the number of elements in the vector, is the index
 into the first vector from which to extract the result value. This means
 conceptually that for a positive immediate, a vector is extracted from
 ``concat(%vec1, %vec2)`` starting at index ``imm``, whereas for a negative
 immediate, it extracts ``-imm`` trailing elements from the first vector, and
 the remaining elements from ``%vec2``.
 These intrinsics work for both fixed and scalable vectors. While this intrinsic
 is marked as experimental, the recommended way to express this operation for
 fixed-width vectors is still to use a shufflevector, as that may allow for more
 optimization opportunities.
 For example:
 .. code-block:: text
 llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, 1)  ==> <B, C, D, E> ; index
 llvm.experimental.vector.splice(<A,B,C,D>, <E,F,G,H>, -3) ==> <B, C, D, E> ; trailing elements
 Arguments:
 """"""""""
 The first two operands are vectors with the same type. The third argument
 ``imm`` is the start index, modulo VL, where VL is the runtime vector length of
 the source/result vector. The ``imm`` is a signed integer constant in the range
 ``-VL <= imm < VL``. For values outside of this range the result is poison.
 Matrix Intrinsics
 -----------------
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@ -556,6 +556,18 @@ enum NodeType {
  /// in terms of the element size of VEC1/VEC2, not in terms of bytes.
  VECTOR_SHUFFLE,
  /// VECTOR_SPLICE(VEC1, VEC2, IMM) - Returns a subvector of the same type as
  /// VEC1/VEC2 from CONCAT_VECTORS(VEC1, VEC2), based on the IMM in two ways.
  /// Let the result type be T, if IMM is positive it represents the starting
  /// element number (an index) from which a subvector of type T is extracted
  /// from CONCAT_VECTORS(VEC1, VEC2). If IMM is negative it represents a count
  /// specifying the number of trailing elements to extract from VEC1, where the
  /// elements of T are selected using the following algorithm:
  ///   RESULT[i] = CONCAT_VECTORS(VEC1,VEC2)[VEC1.ElementCount - ABS(IMM) + i]
  /// If IMM is not in the range [-VL, VL-1] the result vector is undefined. IMM
  /// is a constant integer.
  VECTOR_SPLICE,
  /// SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a
  /// scalar value into element 0 of the resultant vector type.  The top
  /// elements 1 to N-1 of the N-element vector are undefined.  The type
--- a/include/llvm/CodeGen/TargetLowering.h
+++ b/include/llvm/CodeGen/TargetLowering.h
@ -4511,6 +4511,10 @@ public:
  /// Returns true if the expansion was successful.
  bool expandREM(SDNode *Node, SDValue &Result, SelectionDAG &DAG) const;
  /// Method for building the DAG expansion of ISD::VECTOR_SPLICE. This
  /// method accepts vectors as its arguments.
  /// \p Node must be an ISD::VECTOR_SPLICE node producing a scalable vector
  /// (both conditions are asserted by the implementation). The expansion goes
  /// through a stack temporary and the returned value is the load of the
  /// spliced result.
  SDValue expandVectorSplice(SDNode *Node, SelectionDAG &DAG) const;
  //===--------------------------------------------------------------------===//
  // Instruction Emitting Hooks
  //
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@ -1659,6 +1659,13 @@ def int_experimental_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                                                            [llvm_anyvector_ty, llvm_i64_ty],
                                                            [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 //===---------- Named shufflevector intrinsics ------===//
 def int_experimental_vector_splice : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                                                           [LLVMMatchType<0>,
                                                            LLVMMatchType<0>,
                                                            llvm_i32_ty],
                                                           [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 //===----------------------------------------------------------------------===//
 //===----------------------------------------------------------------------===//
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@ -241,6 +241,9 @@ def SDTMaskedLoad: SDTypeProfile<1, 4, [       // masked load
 def SDTVecShuffle : SDTypeProfile<1, 2, [
  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
 ]>;
 def SDTVecSlice : SDTypeProfile<1, 3, [     // vector splice
  SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisInt<3>
 ]>;
 def SDTVecExtract : SDTypeProfile<1, 2, [   // vector extract
  SDTCisEltOfVec<0, 1>, SDTCisPtrTy<2>
 ]>;
@ -655,6 +658,7 @@ def ist        : SDNode<"ISD::STORE"      , SDTIStore,
 def vector_shuffle : SDNode<"ISD::VECTOR_SHUFFLE", SDTVecShuffle, []>;
 def vector_reverse : SDNode<"ISD::VECTOR_REVERSE", SDTVecReverse>;
 def vector_splice : SDNode<"ISD::VECTOR_SPLICE", SDTVecSlice, []>;
 def build_vector : SDNode<"ISD::BUILD_VECTOR", SDTypeProfile<1, -1, []>, []>;
 def splat_vector : SDNode<"ISD::SPLAT_VECTOR", SDTypeProfile<1, 1, []>, []>;
 def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>,
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@ -3208,6 +3208,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
    Results.push_back(Tmp1);
    break;
  }
  case ISD::VECTOR_SPLICE: {
    Results.push_back(TLI.expandVectorSplice(Node, DAG));
    break;
  }
  case ISD::EXTRACT_ELEMENT: {
    EVT OpTy = Node->getOperand(0).getValueType();
    if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
@ -4715,7 +4719,14 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
    Results.push_back(Tmp1);
    break;
  }
-
+  case ISD::VECTOR_SPLICE: {
    Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0));
    Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(1));
    Tmp3 = DAG.getNode(ISD::VECTOR_SPLICE, dl, NVT, Tmp1, Tmp2,
                       Node->getOperand(2));
    Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp3));
    break;
  }
  case ISD::SELECT_CC: {
    SDValue Cond = Node->getOperand(4);
    ISD::CondCode CCCode = cast<CondCodeSDNode>(Cond)->get();
@ -4753,7 +4764,6 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
    Results.push_back(Tmp1);
    break;
  }
  case ISD::SETCC:
  case ISD::STRICT_FSETCC:
  case ISD::STRICT_FSETCCS: {
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@ -100,6 +100,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
                         Res = PromoteIntRes_VECTOR_REVERSE(N); break;
  case ISD::VECTOR_SHUFFLE:
                         Res = PromoteIntRes_VECTOR_SHUFFLE(N); break;
  case ISD::VECTOR_SPLICE:
                         Res = PromoteIntRes_VECTOR_SPLICE(N); break;
  case ISD::INSERT_VECTOR_ELT:
                         Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break;
  case ISD::BUILD_VECTOR:
@ -4616,6 +4618,15 @@ SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) {
  return Swap.getValue(1);
 }
 /// Promote the integer result of a VECTOR_SPLICE: splice the promoted forms
 /// of both vector operands and pass the immediate operand through unchanged.
 /// The result takes the type of the promoted first operand.
 SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SPLICE(SDNode *N) {
  SDValue PromotedLHS = GetPromotedInteger(N->getOperand(0));
  SDValue PromotedRHS = GetPromotedInteger(N->getOperand(1));
  return DAG.getNode(ISD::VECTOR_SPLICE, SDLoc(N), PromotedLHS.getValueType(),
                     PromotedLHS, PromotedRHS, N->getOperand(2));
 }
 SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@ -300,6 +300,7 @@ private:
  SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
  SDValue PromoteIntRes_VECTOR_REVERSE(SDNode *N);
  SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
  SDValue PromoteIntRes_VECTOR_SPLICE(SDNode *N);
  SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
  SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N);
  SDValue PromoteIntRes_SPLAT_VECTOR(SDNode *N);
@ -838,6 +839,7 @@ private:
  void SplitVecRes_VECTOR_REVERSE(SDNode *N, SDValue &Lo, SDValue &Hi);
  void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
                                  SDValue &Hi);
  void SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo, SDValue &Hi);
  void SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi);
  void SplitVecRes_FP_TO_XINT_SAT(SDNode *N, SDValue &Lo, SDValue &Hi);
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@ -947,6 +947,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
  case ISD::VECTOR_SHUFFLE:
    SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
    break;
  case ISD::VECTOR_SPLICE:
    SplitVecRes_VECTOR_SPLICE(N, Lo, Hi);
    break;
  case ISD::VAARG:
    SplitVecRes_VAARG(N, Lo, Hi);
    break;
@ -1257,7 +1260,7 @@ void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
  uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  Hi = DAG.getNode(
      ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec,
-      DAG.getVectorIdxConstant(IdxVal + LoVT.getVectorNumElements(), dl));
+      DAG.getVectorIdxConstant(IdxVal + LoVT.getVectorMinNumElements(), dl));
 }
 void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
@ -5519,3 +5522,19 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_REVERSE(SDNode *N, SDValue &Lo,
  Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, InHi.getValueType(), InHi);
  Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, InLo.getValueType(), InLo);
 }
 /// Split the result of a VECTOR_SPLICE. The node is first expanded with
 /// TargetLowering::expandVectorSplice, then the low and high halves are taken
 /// from the expanded value with EXTRACT_SUBVECTOR.
 void DAGTypeLegalizer::SplitVecRes_VECTOR_SPLICE(SDNode *N, SDValue &Lo,
                                                  SDValue &Hi) {
  SDLoc DL(N);
  EVT LoVT, HiVT;
  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));

  SDValue Spliced = TLI.expandVectorSplice(N, DAG);

  // The low half starts at element 0 of the expanded value.
  SDValue LoIdx = DAG.getVectorIdxConstant(0, DL);
  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, Spliced, LoIdx);

  // The high half starts right after the minimum element count of the low
  // half.
  SDValue HiIdx = DAG.getVectorIdxConstant(LoVT.getVectorMinNumElements(), DL);
  Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, Spliced, HiIdx);
 }
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@ -7105,6 +7105,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
  case Intrinsic::experimental_vector_reverse:
    visitVectorReverse(I);
    return;
  case Intrinsic::experimental_vector_splice:
    visitVectorSplice(I);
    return;
  }
 }
@ -10956,3 +10959,37 @@ void SelectionDAGBuilder::visitFreeze(const FreezeInst &I) {
  setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
                           DAG.getVTList(ValueVTs), Values));
 }
 void SelectionDAGBuilder::visitVectorSplice(const CallInst &I) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
  SDLoc DL = getCurSDLoc();
  SDValue V1 = getValue(I.getOperand(0));
  SDValue V2 = getValue(I.getOperand(1));
  int64_t Imm = cast<ConstantInt>(I.getOperand(2))->getSExtValue();
  // VECTOR_SHUFFLE doesn't support a scalable mask so use a dedicated node.
  if (VT.isScalableVector()) {
    MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
    setValue(&I, DAG.getNode(ISD::VECTOR_SPLICE, DL, VT, V1, V2,
                             DAG.getConstant(Imm, DL, IdxVT)));
    return;
  }
  unsigned NumElts = VT.getVectorNumElements();
  if ((-Imm > NumElts) || (Imm >= NumElts)) {
    // Result is undefined if immediate is out-of-bounds.
    setValue(&I, DAG.getUNDEF(VT));
    return;
  }
  uint64_t Idx = (NumElts + Imm) % NumElts;
  // Use VECTOR_SHUFFLE to maintain original behaviour for fixed-length vectors.
  SmallVector<int, 8> Mask;
  for (unsigned i = 0; i < NumElts; ++i)
    Mask.push_back(Idx + i);
  setValue(&I, DAG.getVectorShuffle(VT, DL, V1, V2, Mask));
 }
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@ -778,6 +778,7 @@ private:
  void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
  void visitVectorReverse(const CallInst &I);
  void visitVectorSplice(const CallInst &I);
  void visitUserOp1(const Instruction &I) {
    llvm_unreachable("UserOp1 should not exist at instruction selection time!");
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@ -288,6 +288,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
  case ISD::EXTRACT_SUBVECTOR:          return "extract_subvector";
  case ISD::SCALAR_TO_VECTOR:           return "scalar_to_vector";
  case ISD::VECTOR_SHUFFLE:             return "vector_shuffle";
  case ISD::VECTOR_SPLICE:              return "vector_splice";
  case ISD::SPLAT_VECTOR:               return "splat_vector";
  case ISD::VECTOR_REVERSE:             return "vector_reverse";
  case ISD::CARRY_FALSE:                return "carry_false";
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@ -8625,3 +8625,76 @@ SDValue TargetLowering::expandFP_TO_INT_SAT(SDNode *Node,
  SDValue ZeroInt = DAG.getConstant(0, dl, DstVT);
  return DAG.getSelectCC(dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO);
 }
 /// Expand an ISD::VECTOR_SPLICE node with a scalable result type through a
 /// stack temporary.
 ///
 /// Both input vectors are stored back to back into a stack slot sized for a
 /// vector with twice the element count of the result type, and the result is
 /// loaded from an address inside that slot that is derived from the immediate.
 ///
 /// \param Node the ISD::VECTOR_SPLICE node (operands: V1, V2, constant IMM).
 /// \param DAG  the SelectionDAG in which the new nodes are created.
 /// \returns the load that produces the spliced vector.
 SDValue TargetLowering::expandVectorSplice(SDNode *Node,
                                           SelectionDAG &DAG) const {
  assert(Node->getOpcode() == ISD::VECTOR_SPLICE && "Unexpected opcode!");
  assert(Node->getValueType(0).isScalableVector() &&
         "Fixed length vector types expected to use SHUFFLE_VECTOR!");
  EVT VT = Node->getValueType(0);
  SDValue V1 = Node->getOperand(0);
  SDValue V2 = Node->getOperand(1);
  int64_t Imm = cast<ConstantSDNode>(Node->getOperand(2))->getSExtValue();
  SDLoc DL(Node);
  // Expand through memory thusly:
  //  Alloca CONCAT_VECTORS_TYPES(V1, V2) Ptr
  //  Store V1, Ptr
  //  Store V2, Ptr + sizeof(V1)
  //  If (Imm < 0)
  //    TrailingElts = -Imm
  //    Ptr = Ptr + sizeof(V1) - (TrailingElts * sizeof(VT.Elt))
  //  else
  //    Ptr = Ptr + (Imm * sizeof(VT.Elt))
  //  Res = Load Ptr
  Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);
  // The temporary has to hold V1 followed by V2, i.e. a vector with twice the
  // element count of VT.
  EVT MemVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
                               VT.getVectorElementCount() * 2);
  SDValue StackPtr = DAG.CreateStackTemporary(MemVT.getStoreSize(), Alignment);
  EVT PtrVT = StackPtr.getValueType();
  auto &MF = DAG.getMachineFunction();
  auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
  auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
  // Store the lo part of CONCAT_VECTORS(V1, V2)
  SDValue StoreV1 = DAG.getStore(DAG.getEntryNode(), DL, V1, StackPtr, PtrInfo);
  // Store the hi part of CONCAT_VECTORS(V1, V2)
  // Offset of V2 inside the temporary: the runtime size of V1 (see the
  // pseudo-code above).
  SDValue OffsetToV2 = DAG.getVScale(
      DL, PtrVT,
      APInt(PtrVT.getFixedSizeInBits(), VT.getStoreSize().getKnownMinSize()));
  SDValue StackPtr2 = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, OffsetToV2);
  // This store is chained on StoreV1, and the loads below are chained on
  // StoreV2, so the final load is ordered after both stores.
  SDValue StoreV2 = DAG.getStore(StoreV1, DL, V2, StackPtr2, PtrInfo);
  // Non-negative immediate: IMM is the index of the first result element
  // within the stored V1:V2 sequence.
  if (Imm >= 0) {
    // Load back the required element. getVectorElementPointer takes care of
    // clamping the index if it's out-of-bounds.
    StackPtr = getVectorElementPointer(DAG, StackPtr, VT, Node->getOperand(2));
    // Load the spliced result
    return DAG.getLoad(VT, DL, StoreV2, StackPtr,
                       MachinePointerInfo::getUnknownStack(MF));
  }
  // Negative immediate: -IMM is the number of trailing elements of V1 that
  // start the result.
  uint64_t TrailingElts = -Imm;
  // NOTE: TrailingElts must be clamped so as not to read outside of V1:V2.
  TypeSize EltByteSize = VT.getVectorElementType().getStoreSize();
  SDValue TrailingBytes =
      DAG.getConstant(TrailingElts * EltByteSize, DL, PtrVT);
  // The runtime clamp is only emitted when the constant count exceeds the
  // minimum element count of VT.
  // NOTE(review): smaller counts presumably always fit because the runtime
  // vector length is never below that minimum - confirm.
  if (TrailingElts > VT.getVectorMinNumElements()) {
    SDValue VLBytes = DAG.getVScale(
        DL, PtrVT,
        APInt(PtrVT.getFixedSizeInBits(), VT.getStoreSize().getKnownMinSize()));
    TrailingBytes = DAG.getNode(ISD::UMIN, DL, PtrVT, TrailingBytes, VLBytes);
  }
  // Calculate the start address of the spliced result.
  // StackPtr2 is the address V2 was stored at, so stepping back from it
  // selects the trailing bytes of V1.
  StackPtr2 = DAG.getNode(ISD::SUB, DL, PtrVT, StackPtr2, TrailingBytes);
  // Load the spliced result
  return DAG.getLoad(VT, DL, StoreV2, StackPtr2,
                     MachinePointerInfo::getUnknownStack(MF));
 }
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@ -849,6 +849,9 @@ void TargetLoweringBase::initActions() {
    setOperationAction(ISD::VECREDUCE_FMIN, VT, Expand);
    setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Expand);
    setOperationAction(ISD::VECREDUCE_SEQ_FMUL, VT, Expand);
    // Named vector shuffles default to expand.
    setOperationAction(ISD::VECTOR_SPLICE, VT, Expand);
  }
  // Most targets ignore the @llvm.prefetch intrinsic.
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@ -1108,6 +1108,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SDIV, VT, Custom);
      setOperationAction(ISD::UDIV, VT, Custom);
      setOperationAction(ISD::SMIN, VT, Custom);
@ -1276,6 +1277,11 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
      for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
        setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
    }
    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv2i1, MVT::nxv2i64);
    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv4i1, MVT::nxv4i32);
    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv8i1, MVT::nxv8i16);
    setOperationPromotedToType(ISD::VECTOR_SPLICE, MVT::nxv16i1, MVT::nxv16i8);
  }
  PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
--- a/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
+++ b/test/CodeGen/AArch64/named-vector-shuffles-neon.ll
@ -0,0 +1,142 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs < %s | FileCheck %s
 target triple = "aarch64-unknown-linux-gnu"
 ;
 ; VECTOR_SPLICE (index)
 ;
 ; An index of 1 on i8 elements is expected to become a single EXT with a byte
 ; offset of 1.
 define <16 x i8> @splice_v16i8_idx(<16 x i8> %a, <16 x i8> %b) #0 {
 ; CHECK-LABEL: splice_v16i8_idx:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #1
 ; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 1)
  ret <16 x i8> %res
 }
 ; An index of 1 on 8-byte double elements is expected to become a single EXT
 ; with a byte offset of 8.
 define <2 x double> @splice_v2f64_idx(<2 x double> %a, <2 x double> %b) #0 {
 ; CHECK-LABEL: splice_v2f64_idx:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
 ; CHECK-NEXT:    ret
  %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 1)
  ret <2 x double> %res
 }
 ; Verify promote type legalisation works as expected.
 ; The expected EXT operates on 8-byte registers and uses a byte offset of 4
 ; for an index of 1, i.e. each of the two elements occupies 4 bytes.
 define <2 x i8> @splice_v2i8_idx(<2 x i8> %a, <2 x i8> %b) #0 {
 ; CHECK-LABEL: splice_v2i8_idx:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
 ; CHECK-NEXT:    ret
  %res = call <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 1)
  ret <2 x i8> %res
 }
 ; Verify splitvec type legalisation works as expected.
 ; With an index of 5 the expected code reads only v1, v2 and v3, and each EXT
 ; uses a byte offset of 4 (one i32 element).
 define <8 x i32> @splice_v8i32_idx(<8 x i32> %a, <8 x i32> %b) #0 {
 ; CHECK-LABEL: splice_v8i32_idx:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #4
 ; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #4
 ; CHECK-NEXT:    ret
  %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 5)
  ret <8 x i32> %res
 }
 ; Verify splitvec type legalisation works as expected.
 ; With an index of 7 the expected code reads v1 through v5, and each EXT uses
 ; a byte offset of 12 (three 4-byte float elements).
 define <16 x float> @splice_v16f32_idx(<16 x float> %a, <16 x float> %b) #0 {
 ; CHECK-LABEL: splice_v16f32_idx:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #12
 ; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #12
 ; CHECK-NEXT:    ext v2.16b, v3.16b, v4.16b, #12
 ; CHECK-NEXT:    ext v3.16b, v4.16b, v5.16b, #12
 ; CHECK-NEXT:    ret
  %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 7)
  ret <16 x float> %res
 }
 ; Verify out-of-bounds index results in undef vector.
 ; The index 2 equals the element count of <2 x double>, so no instruction
 ; other than the return is expected.
 define <2 x double> @splice_v2f64_idx_out_of_bounds(<2 x double> %a, <2 x double> %b) #0 {
 ; CHECK-LABEL: splice_v2f64_idx_out_of_bounds:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
  %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 2)
  ret <2 x double> %res
 }
 ;
 ; VECTOR_SPLICE (trailing elements)
 ;
 ; Taking the 15 trailing elements of a 16-element vector is expected to give
 ; the same EXT (byte offset 1) as the index-1 test splice_v16i8_idx.
 define <16 x i8> @splice_v16i8(<16 x i8> %a, <16 x i8> %b) #0 {
 ; CHECK-LABEL: splice_v16i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #1
 ; CHECK-NEXT:    ret
  %res = call <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8> %a, <16 x i8> %b, i32 -15)
  ret <16 x i8> %res
 }
 ; Taking the single trailing element of a 2-element vector is expected to give
 ; the same EXT (byte offset 8) as the index-1 test splice_v2f64_idx.
 define <2 x double> @splice_v2f64(<2 x double> %a, <2 x double> %b) #0 {
 ; CHECK-LABEL: splice_v2f64:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v1.16b, #8
 ; CHECK-NEXT:    ret
  %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -1)
  ret <2 x double> %res
 }
 ; Verify promote type legalisation works as expected.
 ; One trailing element of a 2-element vector is expected to give the same EXT
 ; (8-byte registers, byte offset 4) as the index-1 test splice_v2i8_idx.
 define <2 x i8> @splice_v2i8(<2 x i8> %a, <2 x i8> %b) #0 {
 ; CHECK-LABEL: splice_v2i8:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.8b, v0.8b, v1.8b, #4
 ; CHECK-NEXT:    ret
  %res = call <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8> %a, <2 x i8> %b, i32 -1)
  ret <2 x i8> %res
 }
 ; Verify splitvec type legalisation works as expected.
 ; Three trailing elements of an 8-element vector are expected to give the same
 ; code as the index-5 test splice_v8i32_idx.
 define <8 x i32> @splice_v8i32(<8 x i32> %a, <8 x i32> %b) #0 {
 ; CHECK-LABEL: splice_v8i32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #4
 ; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #4
 ; CHECK-NEXT:    ret
  %res = call <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32> %a, <8 x i32> %b, i32 -3)
  ret <8 x i32> %res
 }
 ; Verify splitvec type legalisation works as expected.
 ; Nine trailing elements of a 16-element vector are expected to give the same
 ; code as the index-7 test splice_v16f32_idx.
 define <16 x float> @splice_v16f32(<16 x float> %a, <16 x float> %b) #0 {
 ; CHECK-LABEL: splice_v16f32:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ext v0.16b, v1.16b, v2.16b, #12
 ; CHECK-NEXT:    ext v1.16b, v2.16b, v3.16b, #12
 ; CHECK-NEXT:    ext v2.16b, v3.16b, v4.16b, #12
 ; CHECK-NEXT:    ext v3.16b, v4.16b, v5.16b, #12
 ; CHECK-NEXT:    ret
  %res = call <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float> %a, <16 x float> %b, i32 -9)
  ret <16 x float> %res
 }
 ; Verify out-of-bounds trailing element count results in undef vector.
 ; A count of 3 exceeds the two elements of <2 x double>, so no instruction
 ; other than the return is expected.
 define <2 x double> @splice_v2f64_out_of_bounds(<2 x double> %a, <2 x double> %b) #0 {
 ; CHECK-LABEL: splice_v2f64_out_of_bounds:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
  %res = call <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double> %a, <2 x double> %b, i32 -3)
  ret <2 x double> %res
 }
 declare <2 x i8> @llvm.experimental.vector.splice.v2i8(<2 x i8>, <2 x i8>, i32)
 declare <16 x i8> @llvm.experimental.vector.splice.v16i8(<16 x i8>, <16 x i8>, i32)
 declare <8 x i32> @llvm.experimental.vector.splice.v8i32(<8 x i32>, <8 x i32>, i32)
 declare <16 x float> @llvm.experimental.vector.splice.v16f32(<16 x float>, <16 x float>, i32)
 declare <2 x double> @llvm.experimental.vector.splice.v2f64(<2 x double>, <2 x double>, i32)
 attributes #0 = { nounwind "target-features"="+neon" }
--- a/test/CodeGen/AArch64/named-vector-shuffles-sve.ll
+++ b/test/CodeGen/AArch64/named-vector-shuffles-sve.ll