[AArch64][SVE] Add intrinsics for binary narrowing operations

Summary: The following intrinsics for binary narrowing shift right operations are added: * @llvm.aarch64.sve.shrnb * @llvm.aarch64.sve.uqshrnb * @llvm.aarch64.sve.sqshrnb * @llvm.aarch64.sve.sqshrunb * @llvm.aarch64.sve.uqrshrnb * @llvm.aarch64.sve.sqrshrnb * @llvm.aarch64.sve.sqrshrunb * @llvm.aarch64.sve.shrnt * @llvm.aarch64.sve.uqshrnt * @llvm.aarch64.sve.sqshrnt * @llvm.aarch64.sve.sqshrunt * @llvm.aarch64.sve.uqrshrnt * @llvm.aarch64.sve.sqrshrnt * @llvm.aarch64.sve.sqrshrunt Reviewers: sdesmalen, rengolin, efriedma Reviewed By: efriedma Subscribers: tschuett, kristof.beyls, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D71552
2024-11-23 19:23:23 +01:00 · 2019-12-20 09:27:10 +00:00 · 2019-12-20 09:27:10 +00:00 · 03e8b97579
commit 03e8b97579
parent 3ddecf8632
5 changed files with 610 additions and 22 deletions
--- a/include/llvm/IR/IntrinsicsAArch64.td
+++ b/include/llvm/IR/IntrinsicsAArch64.td
@ -1021,6 +1021,17 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
            [LLVMSubdivide2VectorType<0>, llvm_anyvector_ty, LLVMMatchType<0>],
            [IntrNoMem]>;

+  // Narrowing intrinsic with one vector operand and an immediate. The
+  // overloaded type is the (wide) vector operand; the result type is
+  // LLVMSubdivide2VectorType<0> of it. The i32 operand at index 1 is marked
+  // ImmArg.
+  class SVE2_1VectorArg_Imm_Narrowing_Intrinsic
+      : Intrinsic<[LLVMSubdivide2VectorType<0>],
+                  [llvm_anyvector_ty, llvm_i32_ty],
+                  [IntrNoMem, ImmArg<1>]>;
+
+  // Narrowing intrinsic with two vector operands and an immediate. Operand 0
+  // has the same narrowed type as the result (LLVMSubdivide2VectorType<0>),
+  // operand 1 is the overloaded wide vector, and the i32 operand at index 2
+  // is marked ImmArg.
+  class SVE2_2VectorArg_Imm_Narrowing_Intrinsic
+      : Intrinsic<[LLVMSubdivide2VectorType<0>],
+                  [LLVMSubdivide2VectorType<0>, llvm_anyvector_ty,
+                   llvm_i32_ty],
+                  [IntrNoMem, ImmArg<2>]>;
+
  // NOTE: There is no relationship between these intrinsics beyond an attempt
  // to reuse currently identical class definitions.
  class AdvSIMD_SVE_LOGB_Intrinsic  : AdvSIMD_SVE_CNT_Intrinsic;
@ -1559,4 +1570,32 @@ def int_aarch64_sve_subhnt    : SVE2_Merged2VectorArg_Narrowing_Intrinsic;

 def int_aarch64_sve_rsubhnb   : SVE2_2VectorArg_Narrowing_Intrinsic;
 def int_aarch64_sve_rsubhnt   : SVE2_Merged2VectorArg_Narrowing_Intrinsic;
+
+// Narrowing shift right
+def int_aarch64_sve_shrnb     : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_shrnt     : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+
+def int_aarch64_sve_rshrnb    : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_rshrnt    : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+
+// Saturating shift right - signed input/output
+def int_aarch64_sve_sqshrnb   : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_sqshrnt   : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+
+def int_aarch64_sve_sqrshrnb  : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_sqrshrnt  : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+
+// Saturating shift right - unsigned input/output
+def int_aarch64_sve_uqshrnb   : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_uqshrnt   : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+
+def int_aarch64_sve_uqrshrnb  : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_uqrshrnt  : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+
+// Saturating shift right - signed input, unsigned output
+def int_aarch64_sve_sqshrunb  : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_sqshrunt  : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
+
+def int_aarch64_sve_sqrshrunb : SVE2_1VectorArg_Imm_Narrowing_Intrinsic;
+def int_aarch64_sve_sqrshrunt : SVE2_2VectorArg_Imm_Narrowing_Intrinsic;
 }
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@ -624,6 +624,30 @@ def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{
  let ParserMatchClass = Imm1_32Operand;
 }

+// Same as vecshiftR#N, but use TargetConstant (TimmLeaf) instead of Constant
+// (ImmLeaf)
+// tvecshiftR8: accepts immediates in the range [1, 8].
+def tvecshiftR8 : Operand<i32>, TImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+  let EncoderMethod = "getVecShiftR8OpValue";
+  let DecoderMethod = "DecodeVecShiftR8Imm";
+  let ParserMatchClass = Imm1_8Operand;
+}
+// tvecshiftR16: accepts immediates in the range [1, 16].
+def tvecshiftR16 : Operand<i32>, TImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+  let EncoderMethod = "getVecShiftR16OpValue";
+  let DecoderMethod = "DecodeVecShiftR16Imm";
+  let ParserMatchClass = Imm1_16Operand;
+}
+// tvecshiftR32: accepts immediates in the range [1, 32].
+def tvecshiftR32 : Operand<i32>, TImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+  let EncoderMethod = "getVecShiftR32OpValue";
+  let DecoderMethod = "DecodeVecShiftR32Imm";
+  let ParserMatchClass = Imm1_32Operand;
+}
+
 def Imm0_1Operand : AsmImmRange<0, 1>;
 def Imm0_7Operand : AsmImmRange<0, 7>;
 def Imm0_15Operand : AsmImmRange<0, 15>;
--- a/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td
@ -1426,24 +1426,24 @@ let Predicates = [HasSVE2] in {
  defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">;

  // SVE2 bitwise shift right narrow (bottom)
-  defm SQSHRUNB_ZZI  : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb">;
-  defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb">;
-  defm SHRNB_ZZI     : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb">;
-  defm RSHRNB_ZZI    : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb">;
-  defm SQSHRNB_ZZI   : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb">;
-  defm SQRSHRNB_ZZI  : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb">;
-  defm UQSHRNB_ZZI   : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb">;
-  defm UQRSHRNB_ZZI  : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb">;
+  defm SQSHRUNB_ZZI  : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb",  int_aarch64_sve_sqshrunb>;
+  defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb", int_aarch64_sve_sqrshrunb>;
+  defm SHRNB_ZZI     : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb",     int_aarch64_sve_shrnb>;
+  defm RSHRNB_ZZI    : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb",    int_aarch64_sve_rshrnb>;
+  defm SQSHRNB_ZZI   : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb",   int_aarch64_sve_sqshrnb>;
+  defm SQRSHRNB_ZZI  : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb",  int_aarch64_sve_sqrshrnb>;
+  defm UQSHRNB_ZZI   : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb",   int_aarch64_sve_uqshrnb>;
+  defm UQRSHRNB_ZZI  : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb",  int_aarch64_sve_uqrshrnb>;

  // SVE2 bitwise shift right narrow (top)
-  defm SQSHRUNT_ZZI  : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt">;
-  defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt">;
-  defm SHRNT_ZZI     : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt">;
-  defm RSHRNT_ZZI    : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt">;
-  defm SQSHRNT_ZZI   : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt">;
-  defm SQRSHRNT_ZZI  : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt">;
-  defm UQSHRNT_ZZI   : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt">;
-  defm UQRSHRNT_ZZI  : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt">;
+  defm SQSHRUNT_ZZI  : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt",  int_aarch64_sve_sqshrunt>;
+  defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt", int_aarch64_sve_sqrshrunt>;
+  defm SHRNT_ZZI     : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt",     int_aarch64_sve_shrnt>;
+  defm RSHRNT_ZZI    : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt",    int_aarch64_sve_rshrnt>;
+  defm SQSHRNT_ZZI   : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt",   int_aarch64_sve_sqshrnt>;
+  defm SQRSHRNT_ZZI  : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt",  int_aarch64_sve_sqrshrnt>;
+  defm UQSHRNT_ZZI   : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt",   int_aarch64_sve_uqshrnt>;
+  defm UQRSHRNT_ZZI  : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt",  int_aarch64_sve_uqrshrnt>;

  // SVE2 integer add/subtract narrow high part (bottom)
  defm ADDHNB_ZZZ  : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb",  int_aarch64_sve_addhnb>;
--- a/lib/Target/AArch64/SVEInstrFormats.td
+++ b/lib/Target/AArch64/SVEInstrFormats.td
@ -334,6 +334,11 @@ class SVE_4_Op_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
 : Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, vt4:$Op4)),
      (inst $Op1, $Op2, $Op3, $Op4)>;

+// Selects `inst` for a two-operand node `op` whose second operand is an
+// immediate of type vt2 matched through ImmTy. The first operand is passed
+// through unchanged and the immediate is forwarded as ImmTy:$Op2.
+class SVE_2_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+                       ValueType vt2, Operand ImmTy, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, (vt2 ImmTy:$Op2))),
+      (inst $Op1, ImmTy:$Op2)>;
+
 class SVE_3_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
                       ValueType vt2, ValueType vt3, Operand ImmTy,
                       Instruction inst>
@ -2965,17 +2970,21 @@ class sve2_int_bin_shift_imm_narrow_bottom<bits<3> tsz8_64, bits<3> opc,
  let Inst{4-0}   = Zd;
 }

-multiclass sve2_int_bin_shift_imm_right_narrow_bottom<bits<3> opc, string asm> {
+// Right-shift-narrow (bottom) instructions for B/H/S destination elements,
+// plus the patterns that select them from `op` with a target-constant i32
+// shift amount. All three variants use the tvecshiftR#N operands so the
+// instruction operand matches the operand used in the selection pattern.
+multiclass sve2_int_bin_shift_imm_right_narrow_bottom<bits<3> opc, string asm,
+                                                      SDPatternOperator op> {
  def _B : sve2_int_bin_shift_imm_narrow_bottom<{0,0,1}, opc, asm, ZPR8, ZPR16,
-                                                vecshiftR8>;
+                                                tvecshiftR8>;
  def _H : sve2_int_bin_shift_imm_narrow_bottom<{0,1,?}, opc, asm, ZPR16, ZPR32,
-                                                vecshiftR16> {
+                                                tvecshiftR16> {
    let Inst{19} = imm{3};
  }
  def _S : sve2_int_bin_shift_imm_narrow_bottom<{1,?,?}, opc, asm, ZPR32, ZPR64,
-                                                vecshiftR32> {
+                                                tvecshiftR32> {
    let Inst{20-19} = imm{4-3};
  }
+  def : SVE_2_Op_Imm_Pat<nxv16i8, op, nxv8i16, i32, tvecshiftR8,  !cast<Instruction>(NAME # _B)>;
+  def : SVE_2_Op_Imm_Pat<nxv8i16, op, nxv4i32, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_2_Op_Imm_Pat<nxv4i32, op, nxv2i64, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
 }

 class sve2_int_bin_shift_imm_narrow_top<bits<3> tsz8_64, bits<3> opc,
@ -3001,17 +3010,21 @@ class sve2_int_bin_shift_imm_narrow_top<bits<3> tsz8_64, bits<3> opc,
  let Constraints = "$Zd = $_Zd";
 }

-multiclass sve2_int_bin_shift_imm_right_narrow_top<bits<3> opc, string asm> {
+// Right-shift-narrow (top) instructions for B/H/S destination elements, plus
+// the patterns that select them from `op` (merged destination vector, wide
+// source vector, target-constant i32 shift amount). All three variants use
+// the tvecshiftR#N operands so the instruction operand matches the operand
+// used in the selection pattern.
+multiclass sve2_int_bin_shift_imm_right_narrow_top<bits<3> opc, string asm,
+                                                   SDPatternOperator op> {
  def _B : sve2_int_bin_shift_imm_narrow_top<{0,0,1}, opc, asm, ZPR8, ZPR16,
-                                             vecshiftR8>;
+                                             tvecshiftR8>;
  def _H : sve2_int_bin_shift_imm_narrow_top<{0,1,?}, opc, asm, ZPR16, ZPR32,
-                                             vecshiftR16> {
+                                             tvecshiftR16> {
    let Inst{19} = imm{3};
  }
  def _S : sve2_int_bin_shift_imm_narrow_top<{1,?,?}, opc, asm, ZPR32, ZPR64,
-                                             vecshiftR32> {
+                                             tvecshiftR32> {
    let Inst{20-19} = imm{4-3};
  }
+  def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv8i16, i32, tvecshiftR8,  !cast<Instruction>(NAME # _B)>;
+  def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv4i32, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv2i64, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
 }

 class sve2_int_addsub_narrow_high_bottom<bits<2> sz, bits<2> opc, string asm,
--- a/test/CodeGen/AArch64/sve2-intrinsics-binary-narrowing-shr.ll
+++ b/test/CodeGen/AArch64/sve2-intrinsics-binary-narrowing-shr.ll
@ -0,0 +1,512 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2 < %s | FileCheck %s
+
+;
+; SHRNB
+;
+
+define <vscale x 16 x i8> @shrnb_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: shrnb_h:
+; CHECK: shrnb z0.b, z0.h, #8
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.shrnb.nxv8i16(<vscale x 8 x i16> %a,
+                                                                 i32 8)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @shrnb_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: shrnb_s:
+; CHECK: shrnb z0.h, z0.s, #16
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.shrnb.nxv4i32(<vscale x 4 x i32> %a,
+                                                                 i32 16)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @shrnb_d(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: shrnb_d:
+; CHECK: shrnb z0.s, z0.d, #32
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.shrnb.nxv2i64(<vscale x 2 x i64> %a,
+                                                                 i32 32)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; UQSHRNB
+;
+
+define <vscale x 16 x i8> @uqshrnb_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: uqshrnb_h:
+; CHECK: uqshrnb z0.b, z0.h, #1
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.uqshrnb.nxv8i16(<vscale x 8 x i16> %a,
+                                                                   i32 1)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @uqshrnb_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: uqshrnb_s:
+; CHECK: uqshrnb z0.h, z0.s, #1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uqshrnb.nxv4i32(<vscale x 4 x i32> %a,
+                                                                   i32 1)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @uqshrnb_d(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: uqshrnb_d:
+; CHECK: uqshrnb z0.s, z0.d, #1
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uqshrnb.nxv2i64(<vscale x 2 x i64> %a,
+                                                                   i32 1)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; SQSHRNB
+;
+
+define <vscale x 16 x i8> @sqshrnb_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: sqshrnb_h:
+; CHECK: sqshrnb z0.b, z0.h, #1
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.sqshrnb.nxv8i16(<vscale x 8 x i16> %a,
+                                                                   i32 1)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @sqshrnb_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: sqshrnb_s:
+; CHECK: sqshrnb z0.h, z0.s, #1
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sqshrnb.nxv4i32(<vscale x 4 x i32> %a,
+                                                                   i32 1)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sqshrnb_d(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: sqshrnb_d:
+; CHECK: sqshrnb z0.s, z0.d, #1
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqshrnb.nxv2i64(<vscale x 2 x i64> %a,
+                                                                   i32 1)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; SQSHRUNB
+;
+
+define <vscale x 16 x i8> @sqshrunb_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: sqshrunb_h:
+; CHECK: sqshrunb z0.b, z0.h, #7
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.sqshrunb.nxv8i16(<vscale x 8 x i16> %a,
+                                                                    i32 7)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @sqshrunb_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: sqshrunb_s:
+; CHECK: sqshrunb z0.h, z0.s, #15
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sqshrunb.nxv4i32(<vscale x 4 x i32> %a,
+                                                                    i32 15)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sqshrunb_d(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: sqshrunb_d:
+; CHECK: sqshrunb z0.s, z0.d, #31
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqshrunb.nxv2i64(<vscale x 2 x i64> %a,
+                                                                    i32 31)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; UQRSHRNB
+;
+
+define <vscale x 16 x i8> @uqrshrnb_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: uqrshrnb_h:
+; CHECK: uqrshrnb z0.b, z0.h, #2
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.uqrshrnb.nxv8i16(<vscale x 8 x i16> %a,
+                                                                    i32 2)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @uqrshrnb_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: uqrshrnb_s:
+; CHECK: uqrshrnb z0.h, z0.s, #2
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uqrshrnb.nxv4i32(<vscale x 4 x i32> %a,
+                                                                    i32 2)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @uqrshrnb_d(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: uqrshrnb_d:
+; CHECK: uqrshrnb z0.s, z0.d, #2
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uqrshrnb.nxv2i64(<vscale x 2 x i64> %a,
+                                                                    i32 2)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; SQRSHRNB
+;
+
+define <vscale x 16 x i8> @sqrshrnb_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: sqrshrnb_h:
+; CHECK: sqrshrnb z0.b, z0.h, #2
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.sqrshrnb.nxv8i16(<vscale x 8 x i16> %a,
+                                                                    i32 2)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @sqrshrnb_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: sqrshrnb_s:
+; CHECK: sqrshrnb z0.h, z0.s, #2
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sqrshrnb.nxv4i32(<vscale x 4 x i32> %a,
+                                                                    i32 2)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sqrshrnb_d(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: sqrshrnb_d:
+; CHECK: sqrshrnb z0.s, z0.d, #2
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqrshrnb.nxv2i64(<vscale x 2 x i64> %a,
+                                                                    i32 2)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; SQRSHRUNB
+;
+
+define <vscale x 16 x i8> @sqrshrunb_h(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: sqrshrunb_h:
+; CHECK: sqrshrunb z0.b, z0.h, #6
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.sqrshrunb.nxv8i16(<vscale x 8 x i16> %a,
+                                                                     i32 6)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @sqrshrunb_s(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: sqrshrunb_s:
+; CHECK: sqrshrunb z0.h, z0.s, #14
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sqrshrunb.nxv4i32(<vscale x 4 x i32> %a,
+                                                                     i32 14)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sqrshrunb_d(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: sqrshrunb_d:
+; CHECK: sqrshrunb z0.s, z0.d, #30
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqrshrunb.nxv2i64(<vscale x 2 x i64> %a,
+                                                                     i32 30)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; SHRNT
+;
+
+define <vscale x 16 x i8> @shrnt_h(<vscale x 16 x i8> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: shrnt_h:
+; CHECK: shrnt z0.b, z1.h, #3
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.shrnt.nxv8i16(<vscale x 16 x i8> %a,
+                                                                 <vscale x 8 x i16> %b,
+                                                                 i32 3)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @shrnt_s(<vscale x 8 x i16> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: shrnt_s:
+; CHECK: shrnt z0.h, z1.s, #3
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.shrnt.nxv4i32(<vscale x 8 x i16> %a,
+                                                                 <vscale x 4 x i32> %b,
+                                                                 i32 3)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @shrnt_d(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: shrnt_d:
+; CHECK: shrnt z0.s, z1.d, #3
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.shrnt.nxv2i64(<vscale x 4 x i32> %a,
+                                                                 <vscale x 2 x i64> %b,
+                                                                 i32 3)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; UQSHRNT
+;
+
+define <vscale x 16 x i8> @uqshrnt_h(<vscale x 16 x i8> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: uqshrnt_h:
+; CHECK: uqshrnt z0.b, z1.h, #5
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.uqshrnt.nxv8i16(<vscale x 16 x i8> %a,
+                                                                   <vscale x 8 x i16> %b,
+                                                                   i32 5)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @uqshrnt_s(<vscale x 8 x i16> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: uqshrnt_s:
+; CHECK: uqshrnt z0.h, z1.s, #13
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uqshrnt.nxv4i32(<vscale x 8 x i16> %a,
+                                                                   <vscale x 4 x i32> %b,
+                                                                   i32 13)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @uqshrnt_d(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: uqshrnt_d:
+; CHECK: uqshrnt z0.s, z1.d, #29
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uqshrnt.nxv2i64(<vscale x 4 x i32> %a,
+                                                                   <vscale x 2 x i64> %b,
+                                                                   i32 29)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; SQSHRNT
+;
+
+define <vscale x 16 x i8> @sqshrnt_h(<vscale x 16 x i8> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sqshrnt_h:
+; CHECK: sqshrnt z0.b, z1.h, #5
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.sqshrnt.nxv8i16(<vscale x 16 x i8> %a,
+                                                                   <vscale x 8 x i16> %b,
+                                                                   i32 5)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @sqshrnt_s(<vscale x 8 x i16> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sqshrnt_s:
+; CHECK: sqshrnt z0.h, z1.s, #13
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sqshrnt.nxv4i32(<vscale x 8 x i16> %a,
+                                                                   <vscale x 4 x i32> %b,
+                                                                   i32 13)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sqshrnt_d(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sqshrnt_d:
+; CHECK: sqshrnt z0.s, z1.d, #29
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqshrnt.nxv2i64(<vscale x 4 x i32> %a,
+                                                                   <vscale x 2 x i64> %b,
+                                                                   i32 29)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; SQSHRUNT
+;
+
+define <vscale x 16 x i8> @sqshrunt_h(<vscale x 16 x i8> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sqshrunt_h:
+; CHECK: sqshrunt z0.b, z1.h, #4
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.sqshrunt.nxv8i16(<vscale x 16 x i8> %a,
+                                                                    <vscale x 8 x i16> %b,
+                                                                    i32 4)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @sqshrunt_s(<vscale x 8 x i16> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sqshrunt_s:
+; CHECK: sqshrunt z0.h, z1.s, #4
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sqshrunt.nxv4i32(<vscale x 8 x i16> %a,
+                                                                    <vscale x 4 x i32> %b,
+                                                                    i32 4)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sqshrunt_d(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sqshrunt_d:
+; CHECK: sqshrunt z0.s, z1.d, #4
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqshrunt.nxv2i64(<vscale x 4 x i32> %a,
+                                                                    <vscale x 2 x i64> %b,
+                                                                    i32 4)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; UQRSHRNT
+;
+
+define <vscale x 16 x i8> @uqrshrnt_h(<vscale x 16 x i8> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: uqrshrnt_h:
+; CHECK: uqrshrnt z0.b, z1.h, #8
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.uqrshrnt.nxv8i16(<vscale x 16 x i8> %a,
+                                                                    <vscale x 8 x i16> %b,
+                                                                    i32 8)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @uqrshrnt_s(<vscale x 8 x i16> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: uqrshrnt_s:
+; CHECK: uqrshrnt z0.h, z1.s, #12
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.uqrshrnt.nxv4i32(<vscale x 8 x i16> %a,
+                                                                    <vscale x 4 x i32> %b,
+                                                                    i32 12)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @uqrshrnt_d(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: uqrshrnt_d:
+; CHECK: uqrshrnt z0.s, z1.d, #28
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.uqrshrnt.nxv2i64(<vscale x 4 x i32> %a,
+                                                                    <vscale x 2 x i64> %b,
+                                                                    i32 28)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; SQRSHRNT
+;
+
+define <vscale x 16 x i8> @sqrshrnt_h(<vscale x 16 x i8> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sqrshrnt_h:
+; CHECK: sqrshrnt z0.b, z1.h, #8
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.sqrshrnt.nxv8i16(<vscale x 16 x i8> %a,
+                                                                    <vscale x 8 x i16> %b,
+                                                                    i32 8)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @sqrshrnt_s(<vscale x 8 x i16> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sqrshrnt_s:
+; CHECK: sqrshrnt z0.h, z1.s, #12
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sqrshrnt.nxv4i32(<vscale x 8 x i16> %a,
+                                                                    <vscale x 4 x i32> %b,
+                                                                    i32 12)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sqrshrnt_d(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sqrshrnt_d:
+; CHECK: sqrshrnt z0.s, z1.d, #28
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqrshrnt.nxv2i64(<vscale x 4 x i32> %a,
+                                                                    <vscale x 2 x i64> %b,
+                                                                    i32 28)
+  ret <vscale x 4 x i32> %out
+}
+
+;
+; SQRSHRUNT
+;
+
+define <vscale x 16 x i8> @sqrshrunt_h(<vscale x 16 x i8> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: sqrshrunt_h:
+; CHECK: sqrshrunt z0.b, z1.h, #1
+; CHECK-NEXT: ret
+  %out = call <vscale x 16 x i8> @llvm.aarch64.sve.sqrshrunt.nxv8i16(<vscale x 16 x i8> %a,
+                                                                     <vscale x 8 x i16> %b,
+                                                                     i32 1)
+  ret <vscale x 16 x i8> %out
+}
+
+define <vscale x 8 x i16> @sqrshrunt_s(<vscale x 8 x i16> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: sqrshrunt_s:
+; CHECK: sqrshrunt z0.h, z1.s, #5
+; CHECK-NEXT: ret
+  %out = call <vscale x 8 x i16> @llvm.aarch64.sve.sqrshrunt.nxv4i32(<vscale x 8 x i16> %a,
+                                                                     <vscale x 4 x i32> %b,
+                                                                     i32 5)
+  ret <vscale x 8 x i16> %out
+}
+
+define <vscale x 4 x i32> @sqrshrunt_d(<vscale x 4 x i32> %a, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: sqrshrunt_d:
+; CHECK: sqrshrunt z0.s, z1.d, #5
+; CHECK-NEXT: ret
+  %out = call <vscale x 4 x i32> @llvm.aarch64.sve.sqrshrunt.nxv2i64(<vscale x 4 x i32> %a,
+                                                                     <vscale x 2 x i64> %b,
+                                                                     i32 5)
+  ret <vscale x 4 x i32> %out
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.shrnb.nxv8i16(<vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.shrnb.nxv4i32(<vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.shrnb.nxv2i64(<vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.uqshrnb.nxv8i16(<vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.uqshrnb.nxv4i32(<vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.uqshrnb.nxv2i64(<vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.sqshrnb.nxv8i16(<vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sqshrnb.nxv4i32(<vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sqshrnb.nxv2i64(<vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.uqrshrnb.nxv8i16(<vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.uqrshrnb.nxv4i32(<vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.uqrshrnb.nxv2i64(<vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.sqrshrnb.nxv8i16(<vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sqrshrnb.nxv4i32(<vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sqrshrnb.nxv2i64(<vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.sqshrunb.nxv8i16(<vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sqshrunb.nxv4i32(<vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sqshrunb.nxv2i64(<vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.sqrshrunb.nxv8i16(<vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sqrshrunb.nxv4i32(<vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sqrshrunb.nxv2i64(<vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.shrnt.nxv8i16(<vscale x 16 x i8>, <vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.shrnt.nxv4i32(<vscale x 8 x i16>, <vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.shrnt.nxv2i64(<vscale x 4 x i32>, <vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.uqshrnt.nxv8i16(<vscale x 16 x i8>, <vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.uqshrnt.nxv4i32(<vscale x 8 x i16>, <vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.uqshrnt.nxv2i64(<vscale x 4 x i32>, <vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.sqshrnt.nxv8i16(<vscale x 16 x i8>, <vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sqshrnt.nxv4i32(<vscale x 8 x i16>, <vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sqshrnt.nxv2i64(<vscale x 4 x i32>, <vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.sqshrunt.nxv8i16(<vscale x 16 x i8>, <vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sqshrunt.nxv4i32(<vscale x 8 x i16>, <vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sqshrunt.nxv2i64(<vscale x 4 x i32>, <vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.uqrshrnt.nxv8i16(<vscale x 16 x i8>, <vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.uqrshrnt.nxv4i32(<vscale x 8 x i16>, <vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.uqrshrnt.nxv2i64(<vscale x 4 x i32>, <vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.sqrshrnt.nxv8i16(<vscale x 16 x i8>, <vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sqrshrnt.nxv4i32(<vscale x 8 x i16>, <vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sqrshrnt.nxv2i64(<vscale x 4 x i32>, <vscale x 2 x i64>, i32)
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.sqrshrunt.nxv8i16(<vscale x 16 x i8>, <vscale x 8 x i16>, i32)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sqrshrunt.nxv4i32(<vscale x 8 x i16>, <vscale x 4 x i32>, i32)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sqrshrunt.nxv2i64(<vscale x 4 x i32>, <vscale x 2 x i64>, i32)