1
0
mirror of https://github.com/RPCS3/llvm-mirror.git synced 2024-11-24 19:52:54 +01:00

[CodeGen][SelectionDAG]Add new intrinsic experimental.vector.reverse

This patch adds a new intrinsic experimental.vector.reverse that takes a single
vector and returns a vector of matching type but with the original lane order
reversed. For example:

```
vector.reverse(<A,B,C,D>) ==> <D,C,B,A>
```

The new intrinsic supports fixed and scalable vector types.
The fixed-width vector relies on shufflevector to maintain existing behaviour.
Scalable vector uses the new ISD node - VECTOR_REVERSE.

This new intrinsic is one of the named shufflevector intrinsics proposed on the
mailing-list in the RFC at [1].

Patch by Paul Walker (@paulwalker-arm).

[1] https://lists.llvm.org/pipermail/llvm-dev/2020-November/146864.html

Differential Revision: https://reviews.llvm.org/D94883
This commit is contained in:
Caroline Concatto 2021-01-15 16:46:42 +00:00
parent b6252362a0
commit 9ea32f75fa
19 changed files with 729 additions and 9 deletions

View File

@ -16233,6 +16233,33 @@ runtime, then the result vector is undefined. The ``idx`` parameter must be a
vector index constant type (for most targets this will be an integer pointer
type).
'``llvm.experimental.vector.reverse``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
"""""""
This is an overloaded intrinsic.
::
declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
Overview:
"""""""""
The '``llvm.experimental.vector.reverse.*``' intrinsics reverse a vector.
The intrinsic takes a single vector and returns a vector of matching type but
with the original lane order reversed. These intrinsics work for both fixed
and scalable vectors. While this intrinsic is marked as experimental the
recommended way to express reverse operations for fixed-width vectors is still
to use a shufflevector, as that may allow for more optimization opportunities.
Arguments:
""""""""""
The argument to this intrinsic must be a vector.
Matrix Intrinsics
-----------------

View File

@ -540,6 +540,11 @@ enum NodeType {
/// vector, but not the other way around.
EXTRACT_SUBVECTOR,
/// VECTOR_REVERSE(VECTOR) - Returns a vector, of the same type as VECTOR,
/// whose elements are shuffled using the following algorithm:
/// RESULT[i] = VECTOR[VECTOR.ElementCount - 1 - i]
VECTOR_REVERSE,
/// VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as
/// VEC1/VEC2. A VECTOR_SHUFFLE node also contains an array of constant int
/// values that indicate which value (or undef) each result element will

View File

@ -1635,6 +1635,12 @@ def int_preserve_struct_access_index : DefaultAttrsIntrinsic<[llvm_anyptr_ty],
ImmArg<ArgIndex<1>>,
ImmArg<ArgIndex<2>>]>;
//===------------ Intrinsics to perform common vector shuffles ------------===//
def int_experimental_vector_reverse : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMMatchType<0>],
[IntrNoMem]>;
//===---------- Intrinsics to query properties of scalable vectors --------===//
def int_vscale : DefaultAttrsIntrinsic<[llvm_anyint_ty], [], [IntrNoMem]>;

View File

@ -254,6 +254,9 @@ def SDTFPVecReduce : SDTypeProfile<1, 1, [ // FP vector reduction
SDTCisFP<0>, SDTCisVec<1>
]>;
def SDTVecReverse : SDTypeProfile<1, 1, [ // vector reverse
SDTCisVec<0>, SDTCisSameAs<0,1>
]>;
def SDTSubVecExtract : SDTypeProfile<1, 2, [// subvector extract
SDTCisSubVecOfVec<0,1>, SDTCisInt<2>
@ -651,6 +654,7 @@ def ist : SDNode<"ISD::STORE" , SDTIStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def vector_shuffle : SDNode<"ISD::VECTOR_SHUFFLE", SDTVecShuffle, []>;
def vector_reverse : SDNode<"ISD::VECTOR_REVERSE", SDTVecReverse>;
def build_vector : SDNode<"ISD::BUILD_VECTOR", SDTypeProfile<1, -1, []>, []>;
def splat_vector : SDNode<"ISD::SPLAT_VECTOR", SDTypeProfile<1, 1, []>, []>;
def scalar_to_vector : SDNode<"ISD::SCALAR_TO_VECTOR", SDTypeProfile<1, 1, []>,

View File

@ -5373,6 +5373,12 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0,
return Op0;
break;
}
case Intrinsic::experimental_vector_reverse:
// experimental.vector.reverse(experimental.vector.reverse(x)) -> x
if (match(Op0,
m_Intrinsic<Intrinsic::experimental_vector_reverse>(m_Value(X))))
return X;
break;
default:
break;
}

View File

@ -96,6 +96,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::EXTRACT_SUBVECTOR:
Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break;
case ISD::VECTOR_REVERSE:
Res = PromoteIntRes_VECTOR_REVERSE(N); break;
case ISD::VECTOR_SHUFFLE:
Res = PromoteIntRes_VECTOR_SHUFFLE(N); break;
case ISD::INSERT_VECTOR_ELT:
@ -4662,6 +4664,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
return DAG.getBuildVector(NOutVT, dl, Ops);
}
// Promote the illegal integer result of a VECTOR_REVERSE node. Reversing
// only permutes lane order, so it commutes with integer promotion: promote
// the operand and reverse it in the promoted type directly.
SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_REVERSE(SDNode *N) {
SDLoc dl(N);
SDValue V0 = GetPromotedInteger(N->getOperand(0));
// Result type is the promoted operand's type, not the node's original VT.
EVT OutVT = V0.getValueType();
return DAG.getNode(ISD::VECTOR_REVERSE, dl, OutVT, V0);
}
SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SHUFFLE(SDNode *N) {
ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);

View File

@ -298,6 +298,7 @@ private:
SDValue PromoteIntRes_Atomic1(AtomicSDNode *N);
SDValue PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, unsigned ResNo);
SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
SDValue PromoteIntRes_VECTOR_REVERSE(SDNode *N);
SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N);
@ -834,6 +835,7 @@ private:
void SplitVecRes_MGATHER(MaskedGatherSDNode *MGT, SDValue &Lo, SDValue &Hi);
void SplitVecRes_ScalarOp(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VECTOR_REVERSE(SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
SDValue &Hi);
void SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi);

View File

@ -930,6 +930,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SETCC:
SplitVecRes_SETCC(N, Lo, Hi);
break;
case ISD::VECTOR_REVERSE:
SplitVecRes_VECTOR_REVERSE(N, Lo, Hi);
break;
case ISD::VECTOR_SHUFFLE:
SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
break;
@ -5492,3 +5495,13 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
Ops[Idx] = FillVal;
return DAG.getBuildVector(NVT, dl, Ops);
}
// Split the result of a VECTOR_REVERSE into Lo/Hi halves. Reversing a
// concatenation swaps the halves: the low half of the result is the reversed
// *high* half of the input, and the high half is the reversed *low* half.
void DAGTypeLegalizer::SplitVecRes_VECTOR_REVERSE(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue InLo, InHi;
GetSplitVector(N->getOperand(0), InLo, InHi);
SDLoc DL(N);
// Note the deliberate cross-over of InHi -> Lo and InLo -> Hi.
Lo = DAG.getNode(ISD::VECTOR_REVERSE, DL, InHi.getValueType(), InHi);
Hi = DAG.getNode(ISD::VECTOR_REVERSE, DL, InLo.getValueType(), InLo);
}

View File

@ -7025,6 +7025,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
setValue(&I, DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResultVT, Vec, Index));
return;
}
case Intrinsic::experimental_vector_reverse:
visitVectorReverse(I);
return;
}
}
@ -10836,6 +10839,29 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
}
}
// Lower a call to llvm.experimental.vector.reverse. Scalable vectors use the
// dedicated ISD::VECTOR_REVERSE node; fixed-width vectors are lowered to a
// VECTOR_SHUFFLE with a reversed identity mask so existing shuffle-based
// optimisations keep applying.
void SelectionDAGBuilder::visitVectorReverse(const CallInst &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
SDLoc DL = getCurSDLoc();
SDValue V = getValue(I.getOperand(0));
// The intrinsic is overloaded on one type: result and operand types match.
assert(VT == V.getValueType() && "Malformed vector.reverse!");
if (VT.isScalableVector()) {
setValue(&I, DAG.getNode(ISD::VECTOR_REVERSE, DL, VT, V));
return;
}
// Use VECTOR_SHUFFLE for the fixed-length vector
// to maintain existing behavior.
SmallVector<int, 8> Mask;
// For fixed-width vectors the minimum element count is the exact count.
unsigned NumElts = VT.getVectorMinNumElements();
// Mask[i] = NumElts - 1 - i, i.e. lanes in reverse order.
for (unsigned i = 0; i != NumElts; ++i)
Mask.push_back(NumElts - 1 - i);
setValue(&I, DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), Mask));
}
void SelectionDAGBuilder::visitFreeze(const FreezeInst &I) {
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(),

View File

@ -773,6 +773,7 @@ private:
void visitGCResult(const GCResultInst &I);
void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
void visitVectorReverse(const CallInst &I);
void visitUserOp1(const Instruction &I) {
llvm_unreachable("UserOp1 should not exist at instruction selection time!");

View File

@ -289,6 +289,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::SCALAR_TO_VECTOR: return "scalar_to_vector";
case ISD::VECTOR_SHUFFLE: return "vector_shuffle";
case ISD::SPLAT_VECTOR: return "splat_vector";
case ISD::VECTOR_REVERSE: return "vector_reverse";
case ISD::CARRY_FALSE: return "carry_false";
case ISD::ADDC: return "addc";
case ISD::ADDE: return "adde";

View File

@ -3894,7 +3894,7 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
return false;
// Vectors (of > 1 lane) in big endian need tricky handling.
if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1 &&
if (RVEVT.isVector() && RVEVT.getVectorElementCount().isVector() &&
!Subtarget->isLittleEndian())
return false;

View File

@ -1853,7 +1853,6 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::CLASTB_N)
MAKE_CASE(AArch64ISD::LASTA)
MAKE_CASE(AArch64ISD::LASTB)
MAKE_CASE(AArch64ISD::REV)
MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
MAKE_CASE(AArch64ISD::TBL)
MAKE_CASE(AArch64ISD::FADD_PRED)
@ -3594,7 +3593,7 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
case Intrinsic::aarch64_sve_rev:
return DAG.getNode(AArch64ISD::REV, dl, Op.getValueType(),
return DAG.getNode(ISD::VECTOR_REVERSE, dl, Op.getValueType(),
Op.getOperand(1));
case Intrinsic::aarch64_sve_tbl:
return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),

View File

@ -292,7 +292,6 @@ enum NodeType : unsigned {
CLASTB_N,
LASTA,
LASTB,
REV,
TBL,
// Floating-point reductions.

View File

@ -249,9 +249,6 @@ def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithIn
def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>;
def AArch64fadda_p : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>;
def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>;
def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
@ -587,8 +584,8 @@ let Predicates = [HasSVE] in {
defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>;
defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", int_aarch64_sve_revw>;
defm REV_PP : sve_int_perm_reverse_p<"rev", AArch64rev>;
defm REV_ZZ : sve_int_perm_reverse_z<"rev", AArch64rev>;
defm REV_PP : sve_int_perm_reverse_p<"rev", vector_reverse>;
defm REV_ZZ : sve_int_perm_reverse_z<"rev", vector_reverse>;
defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>;
defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>;

View File

@ -0,0 +1,230 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs < %s 2>%t | FileCheck --check-prefix=CHECK --check-prefix=CHECK-SELDAG %s
; RUN: llc -verify-machineinstrs -O0 < %s 2>%t | FileCheck --check-prefix=CHECK --check-prefix=CHECK-FASTISEL %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning
target triple = "aarch64-unknown-linux-gnu"
;
; VECTOR_REVERSE
;
define <16 x i8> @reverse_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: .LCPI0_0:
; CHECK: .byte 15 // 0xf
; CHECK-NEXT: .byte 14 // 0xe
; CHECK-NEXT: .byte 13 // 0xd
; CHECK-NEXT: .byte 12 // 0xc
; CHECK-NEXT: .byte 11 // 0xb
; CHECK-NEXT: .byte 10 // 0xa
; CHECK-NEXT: .byte 9 // 0x9
; CHECK-NEXT: .byte 8 // 0x8
; CHECK-NEXT: .byte 7 // 0x7
; CHECK-NEXT: .byte 6 // 0x6
; CHECK-NEXT: .byte 5 // 0x5
; CHECK-NEXT: .byte 4 // 0x4
; CHECK-NEXT: .byte 3 // 0x3
; CHECK-NEXT: .byte 2 // 0x2
; CHECK-NEXT: .byte 1 // 0x1
; CHECK-NEXT: .byte 0 // 0x0
; CHECK-LABEL: reverse_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI0_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
; CHECK-NEXT: ret
%res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
ret <16 x i8> %res
}
define <8 x i16> @reverse_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: .LCPI1_0:
; CHECK: .byte 14 // 0xe
; CHECK-NEXT: .byte 15 // 0xf
; CHECK-NEXT: .byte 12 // 0xc
; CHECK-NEXT: .byte 13 // 0xd
; CHECK-NEXT: .byte 10 // 0xa
; CHECK-NEXT: .byte 11 // 0xb
; CHECK-NEXT: .byte 8 // 0x8
; CHECK-NEXT: .byte 9 // 0x9
; CHECK-NEXT: .byte 6 // 0x6
; CHECK-NEXT: .byte 7 // 0x7
; CHECK-NEXT: .byte 4 // 0x4
; CHECK-NEXT: .byte 5 // 0x5
; CHECK-NEXT: .byte 2 // 0x2
; CHECK-NEXT: .byte 3 // 0x3
; CHECK-NEXT: .byte 0 // 0x0
; CHECK-NEXT: .byte 1 // 0x1
; CHECK-LABEL: reverse_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI1_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI1_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
; CHECK-NEXT: ret
%res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)
ret <8 x i16> %res
}
define <4 x i32> @reverse_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: reverse_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v0.4s, v0.4s
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
%res = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %a)
ret <4 x i32> %res
}
define <2 x i64> @reverse_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: reverse_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
%res = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> %a)
ret <2 x i64> %res
}
define <8 x half> @reverse_v8f16(<8 x half> %a) #0 {
; CHECK-LABEL: .LCPI4_0:
; CHECK: .byte 14 // 0xe
; CHECK-NEXT: .byte 15 // 0xf
; CHECK-NEXT: .byte 12 // 0xc
; CHECK-NEXT: .byte 13 // 0xd
; CHECK-NEXT: .byte 10 // 0xa
; CHECK-NEXT: .byte 11 // 0xb
; CHECK-NEXT: .byte 8 // 0x8
; CHECK-NEXT: .byte 9 // 0x9
; CHECK-NEXT: .byte 6 // 0x6
; CHECK-NEXT: .byte 7 // 0x7
; CHECK-NEXT: .byte 4 // 0x4
; CHECK-NEXT: .byte 5 // 0x5
; CHECK-NEXT: .byte 2 // 0x2
; CHECK-NEXT: .byte 3 // 0x3
; CHECK-NEXT: .byte 0 // 0x0
; CHECK-NEXT: .byte 1 // 0x1
; CHECK-LABEL: reverse_v8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI4_0
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI4_0]
; CHECK-NEXT: tbl v0.16b, { v0.16b }, v1.16b
; CHECK-NEXT: ret
%res = call <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half> %a)
ret <8 x half> %res
}
define <4 x float> @reverse_v4f32(<4 x float> %a) #0 {
; CHECK-LABEL: reverse_v4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v0.4s, v0.4s
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
%res = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> %a)
ret <4 x float> %res
}
define <2 x double> @reverse_v2f64(<2 x double> %a) #0 {
; CHECK-LABEL: reverse_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT: ret
%res = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> %a)
ret <2 x double> %res
}
; Verify promote type legalisation works as expected.
define <2 x i8> @reverse_v2i8(<2 x i8> %a) #0 {
; CHECK-LABEL: reverse_v2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: rev64 v0.2s, v0.2s
; CHECK-NEXT: ret
%res = call <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
ret <2 x i8> %res
}
; Verify splitvec type legalisation works as expected.
define <8 x i32> @reverse_v8i32(<8 x i32> %a) #0 {
; CHECK-LABEL: reverse_v8i32:
; CHECK-SELDAG: // %bb.0:
; CHECK-SELDAG-NEXT: rev64 v1.4s, v1.4s
; CHECK-SELDAG-NEXT: rev64 v2.4s, v0.4s
; CHECK-SELDAG-NEXT: ext v0.16b, v1.16b, v1.16b, #8
; CHECK-SELDAG-NEXT: ext v1.16b, v2.16b, v2.16b, #8
; CHECK-SELDAG-NEXT: ret
; CHECK-FASTISEL: // %bb.0:
; CHECK-FASTISEL-NEXT: sub sp, sp, #16
; CHECK-FASTISEL-NEXT: str q1, [sp]
; CHECK-FASTISEL-NEXT: mov v1.16b, v0.16b
; CHECK-FASTISEL-NEXT: ldr q0, [sp]
; CHECK-FASTISEL-NEXT: rev64 v0.4s, v0.4s
; CHECK-FASTISEL-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-FASTISEL-NEXT: rev64 v1.4s, v1.4s
; CHECK-FASTISEL-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-FASTISEL-NEXT: add sp, sp, #16
; CHECK-FASTISEL-NEXT: ret
%res = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> %a)
ret <8 x i32> %res
}
; Verify splitvec type legalisation works as expected.
define <16 x float> @reverse_v16f32(<16 x float> %a) #0 {
; CHECK-LABEL: reverse_v16f32:
; CHECK-SELDAG: // %bb.0:
; CHECK-SELDAG-NEXT: rev64 v3.4s, v3.4s
; CHECK-SELDAG-NEXT: rev64 v2.4s, v2.4s
; CHECK-SELDAG-NEXT: rev64 v4.4s, v1.4s
; CHECK-SELDAG-NEXT: rev64 v5.4s, v0.4s
; CHECK-SELDAG-NEXT: ext v0.16b, v3.16b, v3.16b, #8
; CHECK-SELDAG-NEXT: ext v1.16b, v2.16b, v2.16b, #8
; CHECK-SELDAG-NEXT: ext v2.16b, v4.16b, v4.16b, #8
; CHECK-SELDAG-NEXT: ext v3.16b, v5.16b, v5.16b, #8
; CHECK-SELDAG-NEXT: ret
; CHECK-FASTISEL: // %bb.0:
; CHECK-FASTISEL-NEXT: sub sp, sp, #32
; CHECK-FASTISEL-NEXT: str q3, [sp, #16]
; CHECK-FASTISEL-NEXT: str q2, [sp]
; CHECK-FASTISEL-NEXT: mov v2.16b, v1.16b
; CHECK-FASTISEL-NEXT: ldr q1, [sp]
; CHECK-FASTISEL-NEXT: mov v3.16b, v0.16b
; CHECK-FASTISEL-NEXT: ldr q0, [sp, #16]
; CHECK-FASTISEL-NEXT: rev64 v0.4s, v0.4s
; CHECK-FASTISEL-NEXT: ext v0.16b, v0.16b, v0.16b, #8
; CHECK-FASTISEL-NEXT: rev64 v1.4s, v1.4s
; CHECK-FASTISEL-NEXT: ext v1.16b, v1.16b, v1.16b, #8
; CHECK-FASTISEL-NEXT: rev64 v2.4s, v2.4s
; CHECK-FASTISEL-NEXT: ext v2.16b, v2.16b, v2.16b, #8
; CHECK-FASTISEL-NEXT: rev64 v3.4s, v3.4s
; CHECK-FASTISEL-NEXT: ext v3.16b, v3.16b, v3.16b, #8
; CHECK-FASTISEL-NEXT: add sp, sp, #32
; CHECK-FASTISEL-NEXT: ret
%res = call <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float> %a)
ret <16 x float> %res
}
declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8>)
declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
declare <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float>)
declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
attributes #0 = { nounwind "target-features"="+neon" }

View File

@ -0,0 +1,238 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs < %s 2>%t | FileCheck --check-prefix=CHECK --check-prefix=CHECK-SELDAG %s
; RUN: llc -verify-machineinstrs -O0 < %s 2>%t | FileCheck --check-prefix=CHECK --check-prefix=CHECK-FASTISEL %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning
target triple = "aarch64-unknown-linux-gnu"
;
; VECTOR_REVERSE - PPR
;
define <vscale x 2 x i1> @reverse_nxv2i1(<vscale x 2 x i1> %a) #0 {
; CHECK-LABEL: reverse_nxv2i1:
; CHECK: // %bb.0:
; CHECK-NEXT: rev p0.d, p0.d
; CHECK-NEXT: ret
%res = call <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1> %a)
ret <vscale x 2 x i1> %res
}
define <vscale x 4 x i1> @reverse_nxv4i1(<vscale x 4 x i1> %a) #0 {
; CHECK-LABEL: reverse_nxv4i1:
; CHECK: // %bb.0:
; CHECK-NEXT: rev p0.s, p0.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1> %a)
ret <vscale x 4 x i1> %res
}
define <vscale x 8 x i1> @reverse_nxv8i1(<vscale x 8 x i1> %a) #0 {
; CHECK-LABEL: reverse_nxv8i1:
; CHECK: // %bb.0:
; CHECK-NEXT: rev p0.h, p0.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1> %a)
ret <vscale x 8 x i1> %res
}
define <vscale x 16 x i1> @reverse_nxv16i1(<vscale x 16 x i1> %a) #0 {
; CHECK-LABEL: reverse_nxv16i1:
; CHECK: // %bb.0:
; CHECK-NEXT: rev p0.b, p0.b
; CHECK-NEXT: ret
%res = call <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1> %a)
ret <vscale x 16 x i1> %res
}
; Verify splitvec type legalisation works as expected.
define <vscale x 32 x i1> @reverse_nxv32i1(<vscale x 32 x i1> %a) #0 {
; CHECK-LABEL: reverse_nxv32i1:
; CHECK-SELDAG: // %bb.0:
; CHECK-SELDAG-NEXT: rev p2.b, p1.b
; CHECK-SELDAG-NEXT: rev p1.b, p0.b
; CHECK-SELDAG-NEXT: mov p0.b, p2.b
; CHECK-SELDAG-NEXT: ret
; CHECK-FASTISEL: // %bb.0:
; CHECK-FASTISEL-NEXT: str x29, [sp, #-16]
; CHECK-FASTISEL-NEXT: addvl sp, sp, #-1
; CHECK-FASTISEL-NEXT: str p1, [sp, #7, mul vl]
; CHECK-FASTISEL-NEXT: mov p1.b, p0.b
; CHECK-FASTISEL-NEXT: ldr p0, [sp, #7, mul vl]
; CHECK-FASTISEL-NEXT: rev p0.b, p0.b
; CHECK-FASTISEL-NEXT: rev p1.b, p1.b
; CHECK-FASTISEL-NEXT: addvl sp, sp, #1
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16
; CHECK-FASTISEL-NEXT: ret
%res = call <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1> %a)
ret <vscale x 32 x i1> %res
}
;
; VECTOR_REVERSE - ZPR
;
define <vscale x 16 x i8> @reverse_nxv16i8(<vscale x 16 x i8> %a) #0 {
; CHECK-LABEL: reverse_nxv16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: rev z0.b, z0.b
; CHECK-NEXT: ret
%res = call <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8> %a)
ret <vscale x 16 x i8> %res
}
define <vscale x 8 x i16> @reverse_nxv8i16(<vscale x 8 x i16> %a) #0 {
; CHECK-LABEL: reverse_nxv8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: rev z0.h, z0.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16> %a)
ret <vscale x 8 x i16> %res
}
define <vscale x 4 x i32> @reverse_nxv4i32(<vscale x 4 x i32> %a) #0 {
; CHECK-LABEL: reverse_nxv4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: rev z0.s, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
ret <vscale x 4 x i32> %res
}
define <vscale x 2 x i64> @reverse_nxv2i64(<vscale x 2 x i64> %a) #0 {
; CHECK-LABEL: reverse_nxv2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
%res = call <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64> %a)
ret <vscale x 2 x i64> %res
}
define <vscale x 8 x half> @reverse_nxv8f16(<vscale x 8 x half> %a) #0 {
; CHECK-LABEL: reverse_nxv8f16:
; CHECK: // %bb.0:
; CHECK-NEXT: rev z0.h, z0.h
; CHECK-NEXT: ret
%res = call <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half> %a)
ret <vscale x 8 x half> %res
}
define <vscale x 4 x float> @reverse_nxv4f32(<vscale x 4 x float> %a) #0 {
; CHECK-LABEL: reverse_nxv4f32:
; CHECK: // %bb.0:
; CHECK-NEXT: rev z0.s, z0.s
; CHECK-NEXT: ret
%res = call <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
ret <vscale x 4 x float> %res
}
define <vscale x 2 x double> @reverse_nxv2f64(<vscale x 2 x double> %a) #0 {
; CHECK-LABEL: reverse_nxv2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
%res = call <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double> %a)
ret <vscale x 2 x double> %res
}
; Verify promote type legalisation works as expected.
define <vscale x 2 x i8> @reverse_nxv2i8(<vscale x 2 x i8> %a) #0 {
; CHECK-LABEL: reverse_nxv2i8:
; CHECK: // %bb.0:
; CHECK-NEXT: rev z0.d, z0.d
; CHECK-NEXT: ret
%res = call <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8> %a)
ret <vscale x 2 x i8> %res
}
; Verify splitvec type legalisation works as expected.
define <vscale x 8 x i32> @reverse_nxv8i32(<vscale x 8 x i32> %a) #0 {
; CHECK-LABEL: reverse_nxv8i32:
; CHECK-SELDAG: // %bb.0:
; CHECK-SELDAG-NEXT: rev z2.s, z1.s
; CHECK-SELDAG-NEXT: rev z1.s, z0.s
; CHECK-SELDAG-NEXT: mov z0.d, z2.d
; CHECK-SELDAG-NEXT: ret
; CHECK-FASTISEL: // %bb.0:
; CHECK-FASTISEL-NEXT: str x29, [sp, #-16]
; CHECK-FASTISEL-NEXT: addvl sp, sp, #-1
; CHECK-FASTISEL-NEXT: str z1, [sp]
; CHECK-FASTISEL-NEXT: mov z1.d, z0.d
; CHECK-FASTISEL-NEXT: ldr z0, [sp]
; CHECK-FASTISEL-NEXT: rev z0.s, z0.s
; CHECK-FASTISEL-NEXT: rev z1.s, z1.s
; CHECK-FASTISEL-NEXT: addvl sp, sp, #1
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16
; CHECK-FASTISEL-NEXT: ret
%res = call <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32> %a)
ret <vscale x 8 x i32> %res
}
; Verify splitvec type legalisation works as expected.
define <vscale x 16 x float> @reverse_nxv16f32(<vscale x 16 x float> %a) #0 {
; CHECK-LABEL: reverse_nxv16f32:
; CHECK-SELDAG: // %bb.0:
; CHECK-SELDAG-NEXT: rev z5.s, z3.s
; CHECK-SELDAG-NEXT: rev z4.s, z2.s
; CHECK-SELDAG-NEXT: rev z2.s, z1.s
; CHECK-SELDAG-NEXT: rev z3.s, z0.s
; CHECK-SELDAG-NEXT: mov z0.d, z5.d
; CHECK-SELDAG-NEXT: mov z1.d, z4.d
; CHECK-SELDAG-NEXT: ret
; CHECK-FASTISEL: // %bb.0:
; CHECK-FASTISEL-NEXT: str x29, [sp, #-16]
; CHECK-FASTISEL-NEXT: addvl sp, sp, #-2
; CHECK-FASTISEL-NEXT: str z3, [sp, #1, mul vl]
; CHECK-FASTISEL-NEXT: str z2, [sp]
; CHECK-FASTISEL-NEXT: mov z2.d, z1.d
; CHECK-FASTISEL-NEXT: ldr z1, [sp]
; CHECK-FASTISEL-NEXT: mov z3.d, z0.d
; CHECK-FASTISEL-NEXT: ldr z0, [sp, #1, mul vl]
; CHECK-FASTISEL-NEXT: rev z0.s, z0.s
; CHECK-FASTISEL-NEXT: rev z1.s, z1.s
; CHECK-FASTISEL-NEXT: rev z2.s, z2.s
; CHECK-FASTISEL-NEXT: rev z3.s, z3.s
; CHECK-FASTISEL-NEXT: addvl sp, sp, #2
; CHECK-FASTISEL-NEXT: ldr x29, [sp], #16
; CHECK-FASTISEL-NEXT: ret
%res = call <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float> %a)
ret <vscale x 16 x float> %res
}
declare <vscale x 2 x i1> @llvm.experimental.vector.reverse.nxv2i1(<vscale x 2 x i1>)
declare <vscale x 4 x i1> @llvm.experimental.vector.reverse.nxv4i1(<vscale x 4 x i1>)
declare <vscale x 8 x i1> @llvm.experimental.vector.reverse.nxv8i1(<vscale x 8 x i1>)
declare <vscale x 16 x i1> @llvm.experimental.vector.reverse.nxv16i1(<vscale x 16 x i1>)
declare <vscale x 32 x i1> @llvm.experimental.vector.reverse.nxv32i1(<vscale x 32 x i1>)
declare <vscale x 2 x i8> @llvm.experimental.vector.reverse.nxv2i8(<vscale x 2 x i8>)
declare <vscale x 16 x i8> @llvm.experimental.vector.reverse.nxv16i8(<vscale x 16 x i8>)
declare <vscale x 8 x i16> @llvm.experimental.vector.reverse.nxv8i16(<vscale x 8 x i16>)
declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)
declare <vscale x 8 x i32> @llvm.experimental.vector.reverse.nxv8i32(<vscale x 8 x i32>)
declare <vscale x 2 x i64> @llvm.experimental.vector.reverse.nxv2i64(<vscale x 2 x i64>)
declare <vscale x 8 x half> @llvm.experimental.vector.reverse.nxv8f16(<vscale x 8 x half>)
declare <vscale x 4 x float> @llvm.experimental.vector.reverse.nxv4f32(<vscale x 4 x float>)
declare <vscale x 16 x float> @llvm.experimental.vector.reverse.nxv16f32(<vscale x 16 x float>)
declare <vscale x 2 x double> @llvm.experimental.vector.reverse.nxv2f64(<vscale x 2 x double>)
attributes #0 = { nounwind "target-features"="+sve" }

View File

@ -0,0 +1,139 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning
target triple = "x86_64-unknown-unknown"
;
; VECTOR_REVERSE
;
define <16 x i8> @reverse_v16i8(<16 x i8> %a) #0 {
; CHECK-LABEL: reverse_v16i8:
; CHECK: # %bb.0:
; CHECK-NEXT: pxor %xmm1, %xmm1
; CHECK-NEXT: movdqa %xmm0, %xmm2
; CHECK-NEXT: punpcklbw %xmm1, %xmm2
; CHECK-NEXT: pshufd $78, %xmm2, %xmm2
; CHECK-NEXT: pshuflw $27, %xmm2, %xmm2
; CHECK-NEXT: pshufhw $27, %xmm2, %xmm2
; CHECK-NEXT: punpckhbw %xmm1, %xmm0
; CHECK-NEXT: pshufd $78, %xmm0, %xmm0
; CHECK-NEXT: pshuflw $27, %xmm0, %xmm0
; CHECK-NEXT: pshufhw $27, %xmm0, %xmm0
; CHECK-NEXT: packuswb %xmm2, %xmm0
; CHECK-NEXT: retq
%res = call <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8> %a)
ret <16 x i8> %res
}
define <8 x i16> @reverse_v8i16(<8 x i16> %a) #0 {
; CHECK-LABEL: reverse_v8i16:
; CHECK: # %bb.0:
; CHECK-NEXT: pshufd $78, %xmm0, %xmm0
; CHECK-NEXT: pshuflw $27, %xmm0, %xmm0
; CHECK-NEXT: pshufhw $27, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16> %a)
ret <8 x i16> %res
}
define <4 x i32> @reverse_v4i32(<4 x i32> %a) #0 {
; CHECK-LABEL: reverse_v4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: pshufd $27, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32> %a)
ret <4 x i32> %res
}
define <2 x i64> @reverse_v2i64(<2 x i64> %a) #0 {
; CHECK-LABEL: reverse_v2i64:
; CHECK: # %bb.0:
; CHECK-NEXT: pshufd $78, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64> %a)
ret <2 x i64> %res
}
define <4 x float> @reverse_v4f32(<4 x float> %a) #0 {
; CHECK-LABEL: reverse_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: shufps $27, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float> %a)
ret <4 x float> %res
}
define <2 x double> @reverse_v2f64(<2 x double> %a) #0 {
; CHECK-LABEL: reverse_v2f64:
; CHECK: # %bb.0:
; CHECK-NEXT: shufps $78, %xmm0, %xmm0
; CHECK-NEXT: retq
%res = call <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double> %a)
ret <2 x double> %res
}
; Verify promote type legalisation works as expected.
define <2 x i8> @reverse_v2i8(<2 x i8> %a) #0 {
; CHECK-LABEL: reverse_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: movdqa %xmm0, %xmm1
; CHECK-NEXT: psrlw $8, %xmm1
; CHECK-NEXT: psllw $8, %xmm0
; CHECK-NEXT: por %xmm1, %xmm0
; CHECK-NEXT: retq
%res = call <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8> %a)
ret <2 x i8> %res
}
; Verify splitvec type legalisation works as expected.
define <8 x i32> @reverse_v8i32(<8 x i32> %a) #0 {
; CHECK-LABEL: reverse_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: pshufd $27, %xmm1, %xmm2
; CHECK-NEXT: pshufd $27, %xmm0, %xmm1
; CHECK-NEXT: movdqa %xmm2, %xmm0
; CHECK-NEXT: retq
%res = call <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32> %a)
ret <8 x i32> %res
}
; Verify splitvec type legalisation works as expected.
define <16 x float> @reverse_v16f32(<16 x float> %a) #0 {
; CHECK-LABEL: reverse_v16f32:
; CHECK: # %bb.0:
; CHECK-NEXT: movaps %xmm1, %xmm4
; CHECK-NEXT: movaps %xmm0, %xmm5
; CHECK-NEXT: shufps $27, %xmm3, %xmm3
; CHECK-NEXT: shufps $27, %xmm2, %xmm2
; CHECK-NEXT: shufps $27, %xmm1, %xmm4
; CHECK-NEXT: shufps $27, %xmm0, %xmm5
; CHECK-NEXT: movaps %xmm3, %xmm0
; CHECK-NEXT: movaps %xmm2, %xmm1
; CHECK-NEXT: movaps %xmm4, %xmm2
; CHECK-NEXT: movaps %xmm5, %xmm3
; CHECK-NEXT: retq
%res = call <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float> %a)
ret <16 x float> %res
}
declare <2 x i8> @llvm.experimental.vector.reverse.v2i8(<2 x i8>)
declare <16 x i8> @llvm.experimental.vector.reverse.v16i8(<16 x i8>)
declare <8 x i16> @llvm.experimental.vector.reverse.v8i16(<8 x i16>)
declare <4 x i32> @llvm.experimental.vector.reverse.v4i32(<4 x i32>)
declare <8 x i32> @llvm.experimental.vector.reverse.v8i32(<8 x i32>)
declare <2 x i64> @llvm.experimental.vector.reverse.v2i64(<2 x i64>)
declare <8 x half> @llvm.experimental.vector.reverse.v8f16(<8 x half>)
declare <4 x float> @llvm.experimental.vector.reverse.v4f32(<4 x float>)
declare <16 x float> @llvm.experimental.vector.reverse.v16f32(<16 x float>)
declare <2 x double> @llvm.experimental.vector.reverse.v2f64(<2 x double>)
attributes #0 = { nounwind }

View File

@ -0,0 +1,17 @@
; RUN: opt -instsimplify -S < %s 2>%t | FileCheck %s
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
; If this check fails please read test/CodeGen/AArch64/README for instructions on how to resolve it.
; WARN-NOT: warning
; Test back to back reverse shuffles are eliminated.
define <vscale x 4 x i32> @shuffle_b2b_reverse(<vscale x 4 x i32> %a) {
; CHECK-LABEL: @shuffle_b2b_reverse(
; CHECK: ret <vscale x 4 x i32> %a
%rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
%rev.rev = tail call <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32> %rev)
ret <vscale x 4 x i32> %rev.rev
}
declare <vscale x 4 x i32> @llvm.experimental.vector.reverse.nxv4i32(<vscale x 4 x i32>)