[VP,Integer,#1] Vector-predicated integer intrinsics

Summary: This patch adds IR intrinsics for vector-predicated integer arithmetic. It is subpatch #1 of the [integer slice](https://reviews.llvm.org/D57504#1732277) of [LLVM-VP](https://reviews.llvm.org/D57504). LLVM-VP is a larger effort to bring native vector predication to LLVM. Reviewed By: andrew.w.kaylor Differential Revision: https://reviews.llvm.org/D69891
2024-11-25 12:12:47 +01:00 · 2020-03-17 14:52:06 +01:00 · 2020-03-17 14:52:06 +01:00 · 4405e5770f
commit 4405e5770f
parent b3d85ce4e1
13 changed files with 1258 additions and 4 deletions
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@ -7727,6 +7727,8 @@ Example:

      <result> = fadd float 4.0, %var          ; yields float:result = 4.0 + %var

+.. _i_sub:
+
 '``sub``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^

@ -7822,6 +7824,8 @@ Example:
      <result> = fsub float 4.0, %var           ; yields float:result = 4.0 - %var
      <result> = fsub float -0.0, %val          ; yields float:result = -%var

+.. _i_mul:
+
 '``mul``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^

@ -7916,6 +7920,8 @@ Example:

      <result> = fmul float 4.0, %var          ; yields float:result = 4.0 * %var

+.. _i_udiv:
+
 '``udiv``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^

@ -7962,6 +7968,8 @@ Example:

      <result> = udiv i32 4, %var          ; yields i32:result = 4 / %var

+.. _i_sdiv:
+
 '``sdiv``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^

@ -8050,6 +8058,8 @@ Example:

      <result> = fdiv float 4.0, %var          ; yields float:result = 4.0 / %var

+.. _i_urem:
+
 '``urem``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^

@ -8094,6 +8104,8 @@ Example:

      <result> = urem i32 4, %var          ; yields i32:result = 4 % %var

+.. _i_srem:
+
 '``srem``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^

@ -8207,6 +8219,8 @@ commonly be strength reduced from other instructions. They require two
 operands of the same type, execute an operation on them, and produce a
 single value. The resulting value is the same type as its operands.

+.. _i_shl:
+
 '``shl``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^

@ -8259,6 +8273,9 @@ Example:
      <result> = shl i32 1, 32     ; undefined
      <result> = shl <2 x i32> < i32 1, i32 1>, < i32 1, i32 2>   ; yields: result=<2 x i32> < i32 2, i32 4>

+.. _i_lshr:
+
+
 '``lshr``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^

@ -8308,6 +8325,8 @@ Example:
      <result> = lshr i32 1, 32  ; undefined
      <result> = lshr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 2>   ; yields: result=<2 x i32> < i32 0x7FFFFFFF, i32 1>

+.. _i_ashr:
+
 '``ashr``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^^

@ -8358,6 +8377,8 @@ Example:
      <result> = ashr i32 1, 32  ; undefined
      <result> = ashr <2 x i32> < i32 -2, i32 4>, < i32 1, i32 3>   ; yields: result=<2 x i32> < i32 -1, i32 0>

+.. _i_and:
+
 '``and``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^

@ -8407,6 +8428,8 @@ Example:
      <result> = and i32 15, 40          ; yields i32:result = 8
      <result> = and i32 4, 8            ; yields i32:result = 0

+.. _i_or:
+
 '``or``' Instruction
 ^^^^^^^^^^^^^^^^^^^^

@ -8456,6 +8479,8 @@ Example:
      <result> = or i32 15, 40          ; yields i32:result = 47
      <result> = or i32 4, 8            ; yields i32:result = 12

+.. _i_xor:
+
 '``xor``' Instruction
 ^^^^^^^^^^^^^^^^^^^^^

@ -15259,6 +15284,678 @@ intrinsic returns the executable address corresponding to ``tramp``
 after performing the required machine specific adjustments. The pointer
 returned can then be :ref:`bitcast and executed <int_trampoline>`.

+
+.. _int_vp:
+
+Vector Predication Intrinsics
+-----------------------------
+VP intrinsics are intended for predicated SIMD/vector code.  A typical VP
+operation takes a vector mask and an explicit vector length parameter as in:
+
+::
+
+      <W x T> llvm.vp.<opcode>.*(<W x T> %x, <W x T> %y, <W x i1> %mask, i32 %evl)
+
+The vector mask parameter (%mask) always has a vector of ``i1`` type, for
+example ``<32 x i1>``.  The explicit vector length parameter always has the
+type ``i32`` and is an unsigned integer value.  The explicit vector length
+parameter (%evl) is in the range:
+
+::
+
+      0 <= %evl <= W,  where W is the number of vector elements
+
+Note that for :ref:`scalable vector types <t_vector>` ``W`` is the runtime
+length of the vector.
+
+The VP intrinsic has undefined behavior if ``%evl > W``.  The explicit vector
+length (%evl) creates a mask, %EVLmask, with all elements ``0 <= i < %evl`` set
+to True, and all other lanes ``%evl <= i < W`` to False.  A new mask %M is
+calculated with an element-wise AND from %mask and %EVLmask:
+
+::
+
+      M = %mask AND %EVLmask
+
+A vector operation ``<opcode>`` on vectors ``A`` and ``B`` calculates:
+
+::
+
+       A <opcode> B =  {  A[i] <opcode> B[i]   if M[i] = True, and
+                       {  undef                otherwise
+
+Optimization Hint
+^^^^^^^^^^^^^^^^^
+
+Some targets, such as AVX512, do not support the %evl parameter in hardware.
+The use of an effective %evl is discouraged for those targets.  The function
+``TargetTransformInfo::hasActiveVectorLength()`` returns true when the target
+has native support for %evl.
+
+
+.. _int_vp_add:
+
+'``llvm.vp.add.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.add.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.add.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.add.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer addition of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.add``' intrinsic performs integer addition (:ref:`add <i_add>`)
+of the first and second vector operand on each enabled lane.  The result on
+disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = add <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+.. _int_vp_sub:
+
+'``llvm.vp.sub.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.sub.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.sub.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.sub.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer subtraction of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.sub``' intrinsic performs integer subtraction
+(:ref:`sub <i_sub>`)  of the first and second vector operand on each enabled
+lane. The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.sub.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = sub <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+
+.. _int_vp_mul:
+
+'``llvm.vp.mul.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.mul.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.mul.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.mul.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated integer multiplication of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+The '``llvm.vp.mul``' intrinsic performs integer multiplication
+(:ref:`mul <i_mul>`) of the first and second vector operand on each enabled
+lane. The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.mul.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = mul <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_sdiv:
+
+'``llvm.vp.sdiv.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.sdiv.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.sdiv.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.sdiv.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated, signed division of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.sdiv``' intrinsic performs signed division (:ref:`sdiv <i_sdiv>`)
+of the first and second vector operand on each enabled lane.  The result on
+disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.sdiv.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = sdiv <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_udiv:
+
+'``llvm.vp.udiv.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.udiv.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.udiv.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.udiv.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated, unsigned division of two vectors of integers.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The third operand is the vector mask and has the same number of elements as the result vector type. The fourth operand is the explicit vector length of the operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.udiv``' intrinsic performs unsigned division
+(:ref:`udiv <i_udiv>`) of the first and second vector operand on each enabled
+lane. The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.udiv.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = udiv <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+
+.. _int_vp_srem:
+
+'``llvm.vp.srem.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.srem.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.srem.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.srem.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated computation of the signed remainder of two integer vectors.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.srem``' intrinsic computes the remainder of the signed division
+(:ref:`srem <i_srem>`) of the first and second vector operand on each enabled
+lane.  The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.srem.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = srem <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+
+.. _int_vp_urem:
+
+'``llvm.vp.urem.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.urem.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.urem.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.urem.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated computation of the unsigned remainder of two integer vectors.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.urem``' intrinsic computes the remainder of the unsigned division
+(:ref:`urem <i_urem>`) of the first and second vector operand on each enabled
+lane.  The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.urem.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = urem <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_ashr:
+
+'``llvm.vp.ashr.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.ashr.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.ashr.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.ashr.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Vector-predicated arithmetic right-shift.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.ashr``' intrinsic computes the arithmetic right shift
+(:ref:`ashr <i_ashr>`) of the first operand by the second operand on each
+enabled lane. The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.ashr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = ashr <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_lshr:
+
+
+'``llvm.vp.lshr.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.lshr.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.lshr.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.lshr.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Vector-predicated logical right-shift.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.lshr``' intrinsic computes the logical right shift
+(:ref:`lshr <i_lshr>`) of the first operand by the second operand on each
+enabled lane. The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.lshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = lshr <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_shl:
+
+'``llvm.vp.shl.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.shl.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.shl.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.shl.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Vector-predicated left shift.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.shl``' intrinsic computes the left shift (:ref:`shl <i_shl>`) of
+the first operand by the second operand on each enabled lane.  The result on
+disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.shl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = shl <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_or:
+
+'``llvm.vp.or.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.or.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.or.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.or.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Vector-predicated or.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.or``' intrinsic performs a bitwise or (:ref:`or <i_or>`) of the
+first two operands on each enabled lane.  The result on disabled lanes is
+undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.or.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = or <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_and:
+
+'``llvm.vp.and.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.and.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.and.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.and.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Vector-predicated and.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.and``' intrinsic performs a bitwise and (:ref:`and <i_and>`) of
+the first two operands on each enabled lane.  The result on disabled lanes is
+undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.and.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = and <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
+.. _int_vp_xor:
+
+'``llvm.vp.xor.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x i32>  @llvm.vp.xor.v16i32 (<16 x i32> <left_op>, <16 x i32> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x i32>  @llvm.vp.xor.nxv4i32 (<vscale x 4 x i32> <left_op>, <vscale x 4 x i32> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x i64>  @llvm.vp.xor.v256i64 (<256 x i64> <left_op>, <256 x i64> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Vector-predicated, bitwise xor.
+
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of integer type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.xor``' intrinsic performs a bitwise xor (:ref:`xor <i_xor>`) of
+the first two operands on each enabled lane.
+The result on disabled lanes is undefined.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x i32> @llvm.vp.xor.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = xor <4 x i32> %a, %b
+      %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
+
+
 .. _int_mload_mstore:

 Masked Vector Load and Store Intrinsics
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@ -1167,6 +1167,15 @@ public:
  /// to a stack reload.
  unsigned getGISelRematGlobalCost() const;

+  /// \name Vector Predication Information
+  /// @{
+  /// Whether the target supports the %evl parameter of VP intrinsics
+  /// efficiently in hardware (see LLVM Language Reference - "Vector
+  /// Predication Intrinsics").  Use of %evl is discouraged otherwise.
+  bool hasActiveVectorLength() const;
+
+  /// @}
+
  /// @}

 private:
@ -1420,6 +1429,7 @@ public:
                                     ReductionFlags) const = 0;
  virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0;
  virtual unsigned getGISelRematGlobalCost() const = 0;
+  virtual bool hasActiveVectorLength() const = 0;
  virtual int getInstructionLatency(const Instruction *I) = 0;
 };

@ -1913,6 +1923,10 @@ public:
    return Impl.getGISelRematGlobalCost();
  }

+  bool hasActiveVectorLength() const override {
+    return Impl.hasActiveVectorLength();
+  }
+
  int getInstructionLatency(const Instruction *I) override {
    return Impl.getInstructionLatency(I);
  }
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@ -628,6 +628,10 @@ public:
    return 1;
  }

+  bool hasActiveVectorLength() const {
+    return false;
+  }
+
 protected:
  // Obtain the minimum required size to hold the value (without the sign)
  // In case of a vector it returns the min required size for one element.
--- a/include/llvm/IR/IntrinsicInst.h
+++ b/include/llvm/IR/IntrinsicInst.h
@ -206,6 +206,48 @@ namespace llvm {
    /// @}
  };

+  /// This is the common base class for vector predication intrinsics.
+  class VPIntrinsic : public IntrinsicInst {
+  public:
+    static Optional<int> GetMaskParamPos(Intrinsic::ID IntrinsicID);
+    static Optional<int> GetVectorLengthParamPos(Intrinsic::ID IntrinsicID);
+
+    /// \return the llvm.vp.* intrinsic ID for the instruction opcode \p OC.
+    static Intrinsic::ID GetForOpcode(unsigned OC);
+
+    /// \return whether the given intrinsic ID is a VP intrinsic ID.
+    static bool IsVPIntrinsic(Intrinsic::ID);
+
+    /// \return the mask parameter or nullptr.
+    Value *getMaskParam() const;
+
+    /// \return the vector length parameter or nullptr.
+    Value *getVectorLengthParam() const;
+
+    /// \return whether the vector length param can be ignored.
+    bool canIgnoreVectorLengthParam() const;
+
+    /// \return the static element count (vector number of elements) the vector
+    /// length parameter applies to.
+    ElementCount getStaticVectorLength() const;
+
+    // Methods to support type inquiry through isa, cast, and dyn_cast:
+    static bool classof(const IntrinsicInst *I) {
+      return IsVPIntrinsic(I->getIntrinsicID());
+    }
+    static bool classof(const Value *V) {
+      return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+    }
+
+    /// \return the equivalent non-predicated opcode of this intrinsic.
+    unsigned getFunctionalOpcode() const {
+      return GetFunctionalOpcodeForVP(getIntrinsicID());
+    }
+
+    /// \return the equivalent non-predicated opcode for VP intrinsic \p ID.
+    static unsigned GetFunctionalOpcodeForVP(Intrinsic::ID ID);
+  };
+
  /// This is the common base class for constrained floating point intrinsics.
  class ConstrainedFPIntrinsic : public IntrinsicInst {
  public:
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@ -27,6 +27,10 @@ class IntrinsicProperty;
 // effects.  It may be CSE'd deleted if dead, etc.
 def IntrNoMem : IntrinsicProperty;

+// IntrNoSync - Threads executing the intrinsic will not synchronize using
+// memory or other means.
+def IntrNoSync : IntrinsicProperty;
+
 // IntrReadMem - This intrinsic only reads from memory. It does not write to
 // memory and has no other side effects. Therefore, it cannot be moved across
 // potentially aliasing stores. However, it can be reordered otherwise and can
@ -1153,6 +1157,79 @@ def int_is_constant : Intrinsic<[llvm_i1_ty], [llvm_any_ty], [IntrNoMem, IntrWil
 def int_ptrmask: Intrinsic<[llvm_anyptr_ty], [llvm_anyptr_ty, llvm_anyint_ty],
                           [IntrNoMem, IntrSpeculatable, IntrWillReturn]>;

+//===---------------- Vector Predication Intrinsics --------------===//
+
+// Binary operators: operands are (lhs, rhs, mask, explicit vector length).
+let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
+  def int_vp_add : Intrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
+  def int_vp_sub : Intrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
+  def int_vp_mul  : Intrinsic<[ llvm_anyvector_ty ],
+                              [ LLVMMatchType<0>,
+                                LLVMMatchType<0>,
+                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                llvm_i32_ty]>;
+  def int_vp_sdiv : Intrinsic<[ llvm_anyvector_ty ],
+                              [ LLVMMatchType<0>,
+                                LLVMMatchType<0>,
+                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                llvm_i32_ty]>;
+  def int_vp_udiv : Intrinsic<[ llvm_anyvector_ty ],
+                              [ LLVMMatchType<0>,
+                                LLVMMatchType<0>,
+                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                llvm_i32_ty]>;
+  def int_vp_srem : Intrinsic<[ llvm_anyvector_ty ],
+                              [ LLVMMatchType<0>,
+                                LLVMMatchType<0>,
+                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                llvm_i32_ty]>;
+  def int_vp_urem : Intrinsic<[ llvm_anyvector_ty ],
+                              [ LLVMMatchType<0>,
+                                LLVMMatchType<0>,
+                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                llvm_i32_ty]>;
+  def int_vp_ashr : Intrinsic<[ llvm_anyvector_ty ],
+                              [ LLVMMatchType<0>,
+                                LLVMMatchType<0>,
+                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                llvm_i32_ty]>;
+  def int_vp_lshr : Intrinsic<[ llvm_anyvector_ty ],
+                              [ LLVMMatchType<0>,
+                                LLVMMatchType<0>,
+                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                llvm_i32_ty]>;
+  def int_vp_shl : Intrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
+  def int_vp_or : Intrinsic<[ llvm_anyvector_ty ],
+                            [ LLVMMatchType<0>,
+                              LLVMMatchType<0>,
+                              LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                              llvm_i32_ty]>;
+  def int_vp_and : Intrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
+  def int_vp_xor : Intrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
+
+}
+
+
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
 def int_masked_store : Intrinsic<[], [llvm_anyvector_ty,
--- a/include/llvm/IR/VPIntrinsics.def
+++ b/include/llvm/IR/VPIntrinsics.def
@ -0,0 +1,84 @@
+//===-- IR/VPIntrinsics.def - Describes llvm.vp.* Intrinsics -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains descriptions of the various Vector Predication intrinsics.
+// This is used as a central place for enumerating the different instructions
+// and should eventually be the place to put comments about the instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+// Provide definitions of macros so that users of this file do not have to
+// define everything to use it...
+//
+// REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) is expanded once per
+// llvm.vp.* intrinsic listed below:
+//   VPID    - enumerator name of the intrinsic inside the Intrinsic
+//             namespace (e.g. vp_add).
+//   MASKPOS - zero-based argument index of the mask operand.
+//   VLENPOS - zero-based argument index of the vector length operand.
+#ifndef REGISTER_VP_INTRINSIC
+#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS)
+#endif
+
+// Map this VP intrinsic to its functional Opcode
+// (OC is the opcode's enumerator name inside Instruction, e.g. Add).
+#ifndef HANDLE_VP_TO_OC
+#define HANDLE_VP_TO_OC(VPID, OC)
+#endif
+
+///// Integer Arithmetic /////
+
+// llvm.vp.add(x,y,mask,vlen)
+REGISTER_VP_INTRINSIC(vp_add, 2, 3)
+HANDLE_VP_TO_OC(vp_add, Add)
+
+// llvm.vp.and(x,y,mask,vlen)
+REGISTER_VP_INTRINSIC(vp_and, 2, 3)
+HANDLE_VP_TO_OC(vp_and, And)
+
+// llvm.vp.ashr(x,y,mask,vlen)
+REGISTER_VP_INTRINSIC(vp_ashr, 2, 3)
+HANDLE_VP_TO_OC(vp_ashr, AShr)
+
+// llvm.vp.lshr(x,y,mask,vlen)
+REGISTER_VP_INTRINSIC(vp_lshr, 2, 3)
+HANDLE_VP_TO_OC(vp_lshr, LShr)
+
+// llvm.vp.mul(x,y,mask,vlen)
+REGISTER_VP_INTRINSIC(vp_mul, 2, 3)
+HANDLE_VP_TO_OC(vp_mul, Mul)
+
+// llvm.vp.or(x,y,mask,vlen)
+REGISTER_VP_INTRINSIC(vp_or, 2, 3)
+HANDLE_VP_TO_OC(vp_or, Or)
+
+// llvm.vp.sdiv(x,y,mask,vlen)
+REGISTER_VP_INTRINSIC(vp_sdiv, 2, 3)
+HANDLE_VP_TO_OC(vp_sdiv, SDiv)
+
+// llvm.vp.shl(x,y,mask,vlen)
+REGISTER_VP_INTRINSIC(vp_shl, 2, 3)
+HANDLE_VP_TO_OC(vp_shl, Shl)
+
+// llvm.vp.srem(x,y,mask,vlen)
+REGISTER_VP_INTRINSIC(vp_srem, 2, 3)
+HANDLE_VP_TO_OC(vp_srem, SRem)
+
+// llvm.vp.sub(x,y,mask,vlen)
+REGISTER_VP_INTRINSIC(vp_sub, 2, 3)
+HANDLE_VP_TO_OC(vp_sub, Sub)
+
+// llvm.vp.udiv(x,y,mask,vlen)
+REGISTER_VP_INTRINSIC(vp_udiv, 2, 3)
+HANDLE_VP_TO_OC(vp_udiv, UDiv)
+
+// llvm.vp.urem(x,y,mask,vlen)
+REGISTER_VP_INTRINSIC(vp_urem, 2, 3)
+HANDLE_VP_TO_OC(vp_urem, URem)
+
+// llvm.vp.xor(x,y,mask,vlen)
+REGISTER_VP_INTRINSIC(vp_xor, 2, 3)
+HANDLE_VP_TO_OC(vp_xor, Xor)
+
+#undef REGISTER_VP_INTRINSIC
+#undef HANDLE_VP_TO_OC
--- a/lib/IR/IntrinsicInst.cpp
+++ b/lib/IR/IntrinsicInst.cpp
@ -28,6 +28,8 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;

@ -178,6 +180,140 @@ bool ConstrainedFPIntrinsic::classof(const IntrinsicInst *I) {
  }
 }

+/// Return the element count of the vector type of this intrinsic's mask
+/// operand.
+ElementCount VPIntrinsic::getStaticVectorLength() const {
+  const Type *MaskTy = getMaskParam()->getType();
+  return cast<VectorType>(MaskTy)->getElementCount();
+}
+
+/// Return the mask operand of this call, or nullptr when GetMaskParamPos()
+/// reports no mask position for the intrinsic ID.
+Value *VPIntrinsic::getMaskParam() const {
+  Optional<int> MaskPos = GetMaskParamPos(getIntrinsicID());
+  return MaskPos ? getArgOperand(MaskPos.getValue()) : nullptr;
+}
+
+/// Return the vector length operand of this call, or nullptr when
+/// GetVectorLengthParamPos() reports no position for the intrinsic ID.
+Value *VPIntrinsic::getVectorLengthParam() const {
+  Optional<int> VLenPos = GetVectorLengthParamPos(getIntrinsicID());
+  if (!VLenPos.hasValue())
+    return nullptr;
+  return getArgOperand(VLenPos.getValue());
+}
+
+/// Look up the mask operand index registered for \p IntrinsicID in
+/// VPIntrinsics.def. Yields None for IDs that are not registered there.
+Optional<int> VPIntrinsic::GetMaskParamPos(Intrinsic::ID IntrinsicID) {
+  Optional<int> MaskPos = None;
+  switch (IntrinsicID) {
+#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS)                          \
+  case Intrinsic::VPID:                                                        \
+    MaskPos = MASKPOS;                                                         \
+    break;
+#include "llvm/IR/VPIntrinsics.def"
+
+  default:
+    break;
+  }
+  return MaskPos;
+}
+
+/// Look up the vector length operand index registered for \p IntrinsicID in
+/// VPIntrinsics.def. Yields None for IDs that are not registered there.
+Optional<int> VPIntrinsic::GetVectorLengthParamPos(Intrinsic::ID IntrinsicID) {
+  switch (IntrinsicID) {
+#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS)                          \
+  case Intrinsic::VPID:                                                        \
+    return VLENPOS;
+#include "llvm/IR/VPIntrinsics.def"
+
+  default:
+    return None;
+  }
+}
+
+/// Return true iff \p ID is one of the intrinsics registered in
+/// VPIntrinsics.def.
+bool VPIntrinsic::IsVPIntrinsic(Intrinsic::ID ID) {
+  switch (ID) {
+#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS)                          \
+  case Intrinsic::VPID:                                                        \
+    return true;
+#include "llvm/IR/VPIntrinsics.def"
+
+  default:
+    return false;
+  }
+}
+
+/// Return the opcode of the non-predicated instruction that \p ID is mapped to
+/// in VPIntrinsics.def, or Instruction::Call when there is no such mapping.
+unsigned VPIntrinsic::GetFunctionalOpcodeForVP(Intrinsic::ID ID) {
+  unsigned FunctionalOpcode = Instruction::Call;
+  switch (ID) {
+#define HANDLE_VP_TO_OC(VPID, OPC)                                             \
+  case Intrinsic::VPID:                                                        \
+    FunctionalOpcode = Instruction::OPC;                                       \
+    break;
+#include "llvm/IR/VPIntrinsics.def"
+
+  default:
+    break;
+  }
+  return FunctionalOpcode;
+}
+
+/// Return the VP intrinsic that VPIntrinsics.def maps to instruction opcode
+/// \p OC, or Intrinsic::not_intrinsic when there is none.
+Intrinsic::ID VPIntrinsic::GetForOpcode(unsigned OC) {
+  switch (OC) {
+  // The macro parameter is named OPC so it cannot be confused with the
+  // function parameter OC being switched on.
+#define HANDLE_VP_TO_OC(VPID, OPC)                                             \
+  case Instruction::OPC:                                                       \
+    return Intrinsic::VPID;
+#include "llvm/IR/VPIntrinsics.def"
+
+  default:
+    return Intrinsic::not_intrinsic;
+  }
+}
+
+/// Return true if the explicit vector length operand can be shown, from the
+/// IR alone, not to disable any lane of this operation.
+///
+/// The result is conservative: false means "could not prove it", not that
+/// some lane is known to be disabled.
+bool VPIntrinsic::canIgnoreVectorLengthParam() const {
+  using namespace PatternMatch;
+
+  ElementCount EC = getStaticVectorLength();
+
+  // No vlen param - no lanes masked-off by it.
+  auto *VLParam = getVectorLengthParam();
+  if (!VLParam)
+    return true;
+
+  // Note that the VP intrinsic causes undefined behavior if the Explicit Vector
+  // Length parameter is strictly greater-than the number of vector elements of
+  // the operation. This function returns true when this is detected statically
+  // in the IR.
+
+  // Scalable vectors: check whether the vector length is "vscale * Factor"
+  // with Factor >= EC.Min.
+  if (EC.Scalable) {
+    // Undig the DL
+    auto ParMod = this->getModule();
+    if (!ParMod)
+      return false;
+    const auto &DL = ParMod->getDataLayout();
+
+    // Only a multiplication of vscale by a constant is a usable pattern; any
+    // other binary operator on (constant, vscale) proves nothing.
+    uint64_t VScaleFactor;
+    if (match(VLParam, m_c_Mul(m_ConstantInt(VScaleFactor), m_VScale(DL))))
+      return VScaleFactor >= EC.Min;
+
+    // A bare vscale is an implicit factor of 1, which covers every lane only
+    // when the vector has a single element per vscale unit.
+    return EC.Min == 1 && match(VLParam, m_VScale(DL));
+  }
+
+  // standard SIMD operation
+  auto VLConst = dyn_cast<ConstantInt>(VLParam);
+  if (!VLConst)
+    return false;
+
+  uint64_t VLNum = VLConst->getZExtValue();
+  return VLNum >= EC.Min;
+}
+
 Instruction::BinaryOps BinaryOpIntrinsic::getBinaryOp() const {
  switch (getIntrinsicID()) {
    case Intrinsic::uadd_with_overflow:
--- a/test/Verifier/vp-intrinsics.ll
+++ b/test/Verifier/vp-intrinsics.ll
@ -0,0 +1,34 @@
+; Check that calls to the integer llvm.vp.* intrinsics pass the IR verifier.
+; RUN: opt --verify --disable-output %s
+
+define void @test_vp_int(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n) {
+  %r0 = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
+  %r1 = call <8 x i32> @llvm.vp.sub.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
+  %r2 = call <8 x i32> @llvm.vp.mul.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
+  %r3 = call <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
+  %r4 = call <8 x i32> @llvm.vp.srem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
+  %r5 = call <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
+  %r6 = call <8 x i32> @llvm.vp.urem.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
+  %r7 = call <8 x i32> @llvm.vp.and.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
+  %r8 = call <8 x i32> @llvm.vp.or.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
+  %r9 = call <8 x i32> @llvm.vp.xor.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
+  %rA = call <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
+  %rB = call <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
+  %rC = call <8 x i32> @llvm.vp.shl.v8i32(<8 x i32> %i0, <8 x i32> %i1, <8 x i1> %m, i32 %n)
+  ret void
+}
+
+; integer arith
+declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+; bit arith
+declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
+declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) 
+declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) 
+declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)
--- a/unittests/IR/CMakeLists.txt
+++ b/unittests/IR/CMakeLists.txt
@ -41,6 +41,7 @@ add_llvm_unittest(IRTests
  ValueTest.cpp
  VectorTypesTest.cpp
  VerifierTest.cpp
+  VPIntrinsicTest.cpp
  WaymarkTest.cpp
  )

--- a/unittests/IR/VPIntrinsicTest.cpp
+++ b/unittests/IR/VPIntrinsicTest.cpp
@ -0,0 +1,151 @@
+//===- VPIntrinsicTest.cpp - VPIntrinsic unit tests ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/SourceMgr.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Test fixture holding the LLVMContext and diagnostic object that the
+/// VPIntrinsic tests use when parsing textual IR.
+class VPIntrinsicTest : public testing::Test {
+protected:
+  LLVMContext C;
+  SMDiagnostic Err;
+
+  /// Parse a module that contains only declarations of the <8 x i32> variant
+  /// of each integer llvm.vp.* intrinsic. Parse diagnostics go to Err.
+  std::unique_ptr<Module> CreateVPDeclarationModule() {
+      return parseAssemblyString(
+" declare <8 x i32> @llvm.vp.add.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.sub.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.mul.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.sdiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.srem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.udiv.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.urem.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.and.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.xor.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.or.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "
+" declare <8 x i32> @llvm.vp.ashr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)  "
+" declare <8 x i32> @llvm.vp.lshr.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32)  "
+" declare <8 x i32> @llvm.vp.shl.v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) ",
+          Err, C);
+  }
+};
+
+/// Check that VPIntrinsic::canIgnoreVectorLengthParam() returns true
+/// if the vector length parameter does not mask off any lanes.
+TEST_F(VPIntrinsicTest, CanIgnoreVectorLength) {
+  // Uses the fixture's C and Err rather than shadowing them with locals.
+  std::unique_ptr<Module> M =
+      parseAssemblyString(
+"declare <256 x i64> @llvm.vp.mul.v256i64(<256 x i64>, <256 x i64>, <256 x i1>, i32)"
+"declare <vscale x 2 x i64> @llvm.vp.mul.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i32)"
+"declare i32 @llvm.vscale.i32()"
+"define void @test_static_vlen( "
+"      <256 x i64> %i0, <vscale x 2 x i64> %si0,"
+"      <256 x i64> %i1, <vscale x 2 x i64> %si1,"
+"      <256 x i1> %m, <vscale x 2 x i1> %sm, i32 %vl) { "
+"  %r0 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 %vl)"
+"  %r1 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 256)"
+"  %r2 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 0)"
+"  %r3 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 7)"
+"  %r4 = call <256 x i64> @llvm.vp.mul.v256i64(<256 x i64> %i0, <256 x i64> %i1, <256 x i1> %m, i32 123)"
+"  %vs = call i32 @llvm.vscale.i32()"
+"  %vs.i64 = mul i32 %vs, 2"
+"  %r5 = call <vscale x 2 x i64> @llvm.vp.mul.nxv2i64(<vscale x 2 x i64> %si0, <vscale x 2 x i64> %si1, <vscale x 2 x i1> %sm, i32 %vs.i64)"
+"  %r6 = call <vscale x 2 x i64> @llvm.vp.mul.nxv2i64(<vscale x 2 x i64> %si0, <vscale x 2 x i64> %si1, <vscale x 2 x i1> %sm, i32 99999)"
+"  ret void "
+"}",
+          Err, C);
+  // Use gtest assertions so the checks also fire in NDEBUG builds.
+  ASSERT_TRUE(M.get() != nullptr);
+
+  auto *F = M->getFunction("test_static_vlen");
+  ASSERT_TRUE(F != nullptr);
+
+  // One entry per llvm.vp.* call in the entry block, in program order
+  // (%r0 .. %r6).
+  const bool Expected[] = {false, true, false, false, false, true, false};
+  const unsigned NumExpected = sizeof(Expected) / sizeof(Expected[0]);
+  unsigned NumChecked = 0;
+  for (auto &I : F->getEntryBlock()) {
+    VPIntrinsic *VPI = dyn_cast<VPIntrinsic>(&I);
+    if (!VPI)
+      continue;
+
+    ASSERT_LT(NumChecked, NumExpected);
+    ASSERT_EQ(Expected[NumChecked], VPI->canIgnoreVectorLengthParam());
+    ++NumChecked;
+  }
+
+  // Fail if fewer VP calls were recognised than there are expectations;
+  // otherwise the loop above could pass without checking anything.
+  ASSERT_EQ(NumChecked, NumExpected);
+}
+
+/// Check that the argument returned by
+/// VPIntrinsic::Get<X>ParamPos(Intrinsic::ID) has the expected type:
+/// a vector of i1 for the mask and i32 for the vector length.
+TEST_F(VPIntrinsicTest, GetParamPos) {
+  std::unique_ptr<Module> M = CreateVPDeclarationModule();
+  // A gtest assertion (not assert()) so a parse failure is reported as a test
+  // failure in NDEBUG builds as well, instead of a null dereference below.
+  ASSERT_TRUE(M.get() != nullptr);
+
+  for (Function &F : *M) {
+    ASSERT_TRUE(F.isIntrinsic());
+    Optional<int> MaskParamPos =
+        VPIntrinsic::GetMaskParamPos(F.getIntrinsicID());
+    if (MaskParamPos.hasValue()) {
+      Type *MaskParamType = F.getArg(MaskParamPos.getValue())->getType();
+      ASSERT_TRUE(MaskParamType->isVectorTy());
+      ASSERT_TRUE(MaskParamType->getVectorElementType()->isIntegerTy(1));
+    }
+
+    Optional<int> VecLenParamPos =
+        VPIntrinsic::GetVectorLengthParamPos(F.getIntrinsicID());
+    if (VecLenParamPos.hasValue()) {
+      Type *VecLenParamType = F.getArg(VecLenParamPos.getValue())->getType();
+      ASSERT_TRUE(VecLenParamType->isIntegerTy(32));
+    }
+  }
+}
+
+/// Map every instruction opcode to its VP intrinsic (where one exists) and
+/// back again; the opcode must come out of the round trip unchanged.
+TEST_F(VPIntrinsicTest, OpcodeRoundTrip) {
+  // Gather the numeric value of each opcode listed in Instruction.def.
+  std::vector<unsigned> AllOpcodes;
+  AllOpcodes.reserve(100);
+#define HANDLE_INST(Num, Name, Class) AllOpcodes.push_back(Num);
+#include "llvm/IR/Instruction.def"
+
+  unsigned NumRoundTrips = 0;
+  for (unsigned Opcode : AllOpcodes) {
+    const Intrinsic::ID VPID = VPIntrinsic::GetForOpcode(Opcode);
+    if (VPID != Intrinsic::not_intrinsic) {
+      const unsigned BackOpcode = VPIntrinsic::GetFunctionalOpcodeForVP(VPID);
+      // Instruction::Call signals that the intrinsic has no functional opcode.
+      if (BackOpcode != Instruction::Call) {
+        ASSERT_EQ(BackOpcode, Opcode);
+        ++NumRoundTrips;
+      }
+    }
+  }
+
+  // At least one opcode must have completed the full round trip.
+  ASSERT_NE(NumRoundTrips, 0u);
+}
+
+} // end anonymous namespace
--- a/utils/TableGen/CodeGenIntrinsics.h
+++ b/utils/TableGen/CodeGenIntrinsics.h
@ -123,6 +123,9 @@ struct CodeGenIntrinsic {
  /// True if the intrinsic is no-return.
  bool isNoReturn;

+  /// True if the intrinsic is no-sync.
+  bool isNoSync;
+
  /// True if the intrinsic is will-return.
  bool isWillReturn;

--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@ -607,6 +607,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
  isCommutative = false;
  canThrow = false;
  isNoReturn = false;
+  isNoSync = false;
  isWillReturn = false;
  isCold = false;
  isNoDuplicate = false;
@ -726,8 +727,7 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
      // variants with iAny types; otherwise, if the intrinsic is not
      // overloaded, all the types can be specified directly.
      assert(((!TyEl->isSubClassOf("LLVMExtendedType") &&
-               !TyEl->isSubClassOf("LLVMTruncatedType") &&
-               !TyEl->isSubClassOf("LLVMScalarOrSameVectorWidth")) ||
+               !TyEl->isSubClassOf("LLVMTruncatedType")) ||
              VT == MVT::iAny || VT == MVT::vAny) &&
             "Expected iAny or vAny type");
    } else
@ -772,6 +772,8 @@ CodeGenIntrinsic::CodeGenIntrinsic(Record *R) {
      isConvergent = true;
    else if (Property->getName() == "IntrNoReturn")
      isNoReturn = true;
+    else if (Property->getName() == "IntrNoSync")
+      isNoSync = true;
    else if (Property->getName() == "IntrWillReturn")
      isWillReturn = true;
    else if (Property->getName() == "IntrCold")
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp
@ -579,6 +579,9 @@ struct AttributeComparator {
    if (L->isNoReturn != R->isNoReturn)
      return R->isNoReturn;

+    if (L->isNoSync != R->isNoSync)
+      return R->isNoSync;
+
    if (L->isWillReturn != R->isWillReturn)
      return R->isWillReturn;

@ -720,8 +723,8 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,

    if (!intrinsic.canThrow ||
        (intrinsic.ModRef != CodeGenIntrinsic::ReadWriteMem && !intrinsic.hasSideEffects) ||
-        intrinsic.isNoReturn || intrinsic.isWillReturn || intrinsic.isCold ||
-        intrinsic.isNoDuplicate || intrinsic.isConvergent ||
+        intrinsic.isNoReturn || intrinsic.isNoSync || intrinsic.isWillReturn ||
+        intrinsic.isCold || intrinsic.isNoDuplicate || intrinsic.isConvergent ||
        intrinsic.isSpeculatable) {
      OS << "      const Attribute::AttrKind Atts[] = {";
      bool addComma = false;
@ -735,6 +738,12 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
        OS << "Attribute::NoReturn";
        addComma = true;
      }
+      if (intrinsic.isNoSync) {
+        if (addComma)
+          OS << ",";
+        OS << "Attribute::NoSync";
+        addComma = true;
+      }
      if (intrinsic.isWillReturn) {
        if (addComma)
          OS << ",";