Support arbitrary address space pointers in masked gather/scatter intrinsics.

Fixes PR31789 - When loop-vectorize tries to use these intrinsics for a non-default address space pointer we fail with a "Calling a function with a bad singature!" assertion. This patch solves this by adding the 'vector of pointers' argument as an overloaded type which will determine the address space. Differential revision: https://reviews.llvm.org/D31490 llvm-svn: 302018
2024-11-22 10:42:39 +01:00 · 2017-05-03 12:28:54 +00:00 · 2017-05-03 12:28:54 +00:00 · 3908e15a8b
commit 3908e15a8b
parent 56c6d8c476
19 changed files with 556 additions and 171 deletions
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@ -7915,7 +7915,7 @@ makes sense:
    ; get pointers for 8 elements from array B
    %ptrs = getelementptr double, double* %B, <8 x i32> %C
    ; load 8 elements from array B into A
-    %A = call <8 x double> @llvm.masked.gather.v8f64(<8 x double*> %ptrs,
+    %A = call <8 x double> @llvm.masked.gather.v8f64.v8p0f64(<8 x double*> %ptrs,
         i32 8, <8 x i1> %mask, <8 x double> %passthru)

 Conversion Operations
@ -12024,9 +12024,9 @@ This is an overloaded intrinsic. The loaded data are multiple scalar values of a

 ::

-      declare <16 x float> @llvm.masked.gather.v16f32   (<16 x float*> <ptrs>, i32 <alignment>, <16 x i1> <mask>, <16 x float> <passthru>)
-      declare <2 x double> @llvm.masked.gather.v2f64    (<2 x double*> <ptrs>, i32 <alignment>, <2 x i1>  <mask>, <2 x double> <passthru>)
-      declare <8 x float*> @llvm.masked.gather.v8p0f32  (<8 x float**> <ptrs>, i32 <alignment>, <8 x i1>  <mask>, <8 x float*> <passthru>)
+      declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32   (<16 x float*> <ptrs>, i32 <alignment>, <16 x i1> <mask>, <16 x float> <passthru>)
+      declare <2 x double> @llvm.masked.gather.v2f64.v2p1f64     (<2 x double addrspace(1)*> <ptrs>, i32 <alignment>, <2 x i1>  <mask>, <2 x double> <passthru>)
+      declare <8 x float*> @llvm.masked.gather.v8p0f32.v8p0p0f32 (<8 x float**> <ptrs>, i32 <alignment>, <8 x i1>  <mask>, <8 x float*> <passthru>)

 Overview:
 """""""""
@ -12049,7 +12049,7 @@ The semantics of this operation are equivalent to a sequence of conditional scal

 ::

-       %res = call <4 x double> @llvm.masked.gather.v4f64 (<4 x double*> %ptrs, i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)
+       %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64 (<4 x double*> %ptrs, i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x double> undef)

       ;; The gather with all-true mask is equivalent to the following instruction sequence
       %ptr0 = extractelement <4 x double*> %ptrs, i32 0
@ -12078,9 +12078,9 @@ This is an overloaded intrinsic. The data stored in memory is a vector of any in

 ::

-       declare void @llvm.masked.scatter.v8i32   (<8 x i32>     <value>, <8 x i32*>     <ptrs>, i32 <alignment>, <8 x i1>  <mask>)
-       declare void @llvm.masked.scatter.v16f32  (<16 x float>  <value>, <16 x float*>  <ptrs>, i32 <alignment>, <16 x i1> <mask>)
-       declare void @llvm.masked.scatter.v4p0f64 (<4 x double*> <value>, <4 x double**> <ptrs>, i32 <alignment>, <4 x i1>  <mask>)
+       declare void @llvm.masked.scatter.v8i32.v8p0i32     (<8 x i32>     <value>, <8 x i32*>     <ptrs>, i32 <alignment>, <8 x i1>  <mask>)
+       declare void @llvm.masked.scatter.v16f32.v16p1f32   (<16 x float>  <value>, <16 x float addrspace(1)*>  <ptrs>, i32 <alignment>, <16 x i1> <mask>)
+       declare void @llvm.masked.scatter.v4p0f64.v4p0p0f64 (<4 x double*> <value>, <4 x double**> <ptrs>, i32 <alignment>, <4 x i1>  <mask>)

 Overview:
 """""""""
@ -12101,7 +12101,7 @@ The '``llvm.masked.scatter``' intrinsics is designed for writing selected vector
 ::

       ;; This instruction unconditionally stores data vector in multiple addresses
-       call @llvm.masked.scatter.v8i32 (<8 x i32> %value, <8 x i32*> %ptrs, i32 4,  <8 x i1>  <true, true, .. true>)
+       call @llvm.masked.scatter.v8i32.v8p0i32 (<8 x i32> %value, <8 x i32*> %ptrs, i32 4,  <8 x i1>  <true, true, .. true>)

       ;; It is equivalent to a list of scalar stores
       %val0 = extractelement <8 x i32> %value, i32 0
--- a/include/llvm/IR/Intrinsics.h
+++ b/include/llvm/IR/Intrinsics.h
@ -100,7 +100,7 @@ namespace Intrinsic {
      Void, VarArg, MMX, Token, Metadata, Half, Float, Double,
      Integer, Vector, Pointer, Struct,
      Argument, ExtendArgument, TruncArgument, HalfVecArgument,
-      SameVecWidthArgument, PtrToArgument, PtrToElt, VecOfPtrsToElt
+      SameVecWidthArgument, PtrToArgument, PtrToElt, VecOfAnyPtrsToElt
    } Kind;

    union {
@ -119,25 +119,43 @@ namespace Intrinsic {
      AK_AnyVector,
      AK_AnyPointer
    };
+
    unsigned getArgumentNumber() const {
      assert(Kind == Argument || Kind == ExtendArgument ||
             Kind == TruncArgument || Kind == HalfVecArgument ||
             Kind == SameVecWidthArgument || Kind == PtrToArgument ||
-             Kind == PtrToElt || Kind == VecOfPtrsToElt);
+             Kind == PtrToElt);
      return Argument_Info >> 3;
    }
    ArgKind getArgumentKind() const {
      assert(Kind == Argument || Kind == ExtendArgument ||
             Kind == TruncArgument || Kind == HalfVecArgument ||
-             Kind == SameVecWidthArgument || Kind == PtrToArgument ||
-             Kind == VecOfPtrsToElt);
+             Kind == SameVecWidthArgument || Kind == PtrToArgument);
      return (ArgKind)(Argument_Info & 7);
    }

+    // VecOfAnyPtrsToElt uses both an overloaded argument (for address space)
+    // and a reference argument (for matching vector width and element types)
+    unsigned getOverloadArgNumber() const {
+      assert(Kind == VecOfAnyPtrsToElt);
+      return Argument_Info >> 16;
+    }
+    unsigned getRefArgNumber() const {
+      assert(Kind == VecOfAnyPtrsToElt);
+      return Argument_Info & 0xFFFF;
+    }
+
    static IITDescriptor get(IITDescriptorKind K, unsigned Field) {
      IITDescriptor Result = { K, { Field } };
      return Result;
    }
+
+    static IITDescriptor get(IITDescriptorKind K, unsigned short Hi,
+                             unsigned short Lo) {
+      unsigned Field = Hi << 16 | Lo;
+      IITDescriptor Result = {K, {Field}};
+      return Result;
+    }
  };

  /// Return the IIT table descriptor for the specified intrinsic into an array
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@ -155,7 +155,7 @@ class LLVMVectorSameWidth<int num, LLVMType elty>
 }
 class LLVMPointerTo<int num> : LLVMMatchType<num>;
 class LLVMPointerToElt<int num> : LLVMMatchType<num>;
-class LLVMVectorOfPointersToElt<int num> : LLVMMatchType<num>;
+class LLVMVectorOfAnyPointersToElt<int num> : LLVMMatchType<num>;

 // Match the type of another intrinsic parameter that is expected to be a
 // vector type, but change the element count to be half as many
@ -761,14 +761,14 @@ def int_masked_load  : Intrinsic<[llvm_anyvector_ty],
                                 [IntrReadMem, IntrArgMemOnly]>;

 def int_masked_gather: Intrinsic<[llvm_anyvector_ty],
-                                 [LLVMVectorOfPointersToElt<0>, llvm_i32_ty,
+                                 [LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty,
                                  LLVMVectorSameWidth<0, llvm_i1_ty>,
                                  LLVMMatchType<0>],
                                 [IntrReadMem]>;

 def int_masked_scatter: Intrinsic<[],
                                  [llvm_anyvector_ty,
-                                   LLVMVectorOfPointersToElt<0>, llvm_i32_ty,
+                                   LLVMVectorOfAnyPointersToElt<0>, llvm_i32_ty,
                                   LLVMVectorSameWidth<0, llvm_i1_ty>]>;

 def int_masked_expandload: Intrinsic<[llvm_anyvector_ty],
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@ -467,6 +467,27 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
        return true;
      }
    }
+    // Renaming gather/scatter intrinsics with no address space overloading
+    // to the new overload which includes an address space
+    if (Name.startswith("masked.gather.")) {
+      Type *Tys[] = {F->getReturnType(), F->arg_begin()->getType()};
+      if (F->getName() != Intrinsic::getName(Intrinsic::masked_gather, Tys)) {
+        rename(F);
+        NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                          Intrinsic::masked_gather, Tys);
+        return true;
+      }
+    }
+    if (Name.startswith("masked.scatter.")) {
+      auto Args = F->getFunctionType()->params();
+      Type *Tys[] = {Args[0], Args[1]};
+      if (F->getName() != Intrinsic::getName(Intrinsic::masked_scatter, Tys)) {
+        rename(F);
+        NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                          Intrinsic::masked_scatter, Tys);
+        return true;
+      }
+    }
    break;
  }
  case 'n': {
@ -2072,7 +2093,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
  case Intrinsic::invariant_start:
  case Intrinsic::invariant_end:
  case Intrinsic::masked_load:
-  case Intrinsic::masked_store: {
+  case Intrinsic::masked_store:
+  case Intrinsic::masked_gather:
+  case Intrinsic::masked_scatter: {
    SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
                                 CI->arg_operands().end());
    NewCall = Builder.CreateCall(NewFn, Args);
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@ -574,13 +574,12 @@ enum IIT_Info {
  IIT_SAME_VEC_WIDTH_ARG = 31,
  IIT_PTR_TO_ARG = 32,
  IIT_PTR_TO_ELT = 33,
-  IIT_VEC_OF_PTRS_TO_ELT = 34,
+  IIT_VEC_OF_ANYPTRS_TO_ELT = 34,
  IIT_I128 = 35,
  IIT_V512 = 36,
  IIT_V1024 = 37
 };

-
 static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
                      SmallVectorImpl<Intrinsic::IITDescriptor> &OutputTable) {
  IIT_Info Info = IIT_Info(Infos[NextElt++]);
@ -716,10 +715,11 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
    OutputTable.push_back(IITDescriptor::get(IITDescriptor::PtrToElt, ArgInfo));
    return;
  }
-  case IIT_VEC_OF_PTRS_TO_ELT: {
-    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
-    OutputTable.push_back(IITDescriptor::get(IITDescriptor::VecOfPtrsToElt,
-                                             ArgInfo));
+  case IIT_VEC_OF_ANYPTRS_TO_ELT: {
+    unsigned short ArgNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    unsigned short RefNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(
+        IITDescriptor::get(IITDescriptor::VecOfAnyPtrsToElt, ArgNo, RefNo));
    return;
  }
  case IIT_EMPTYSTRUCT:
@ -808,7 +808,6 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
      Elts[i] = DecodeFixedType(Infos, Tys, Context);
    return StructType::get(Context, makeArrayRef(Elts,D.Struct_NumElements));
  }
-
  case IITDescriptor::Argument:
    return Tys[D.getArgumentNumber()];
  case IITDescriptor::ExtendArgument: {
@ -850,15 +849,9 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
    Type *EltTy = VTy->getVectorElementType();
    return PointerType::getUnqual(EltTy);
  }
-  case IITDescriptor::VecOfPtrsToElt: {
-    Type *Ty = Tys[D.getArgumentNumber()];
-    VectorType *VTy = dyn_cast<VectorType>(Ty);
-    if (!VTy)
-      llvm_unreachable("Expected an argument of Vector Type");
-    Type *EltTy = VTy->getVectorElementType();
-    return VectorType::get(PointerType::getUnqual(EltTy),
-                           VTy->getNumElements());
-  }
+  case IITDescriptor::VecOfAnyPtrsToElt:
+    // Return the overloaded type (which determines the pointers address space)
+    return Tys[D.getOverloadArgNumber()];
 }
  llvm_unreachable("unhandled");
 }
@ -1054,11 +1047,22 @@ bool Intrinsic::matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor>
      return (!ThisArgType || !ReferenceType ||
              ThisArgType->getElementType() != ReferenceType->getElementType());
    }
-    case IITDescriptor::VecOfPtrsToElt: {
-      if (D.getArgumentNumber() >= ArgTys.size())
+    case IITDescriptor::VecOfAnyPtrsToElt: {
+      unsigned RefArgNumber = D.getRefArgNumber();
+
+      // This may only be used when referring to a previous argument.
+      if (RefArgNumber >= ArgTys.size())
        return true;
-      VectorType * ReferenceType =
-              dyn_cast<VectorType> (ArgTys[D.getArgumentNumber()]);
+
+      // Record the overloaded type
+      assert(D.getOverloadArgNumber() == ArgTys.size() &&
+             "Table consistency error");
+      ArgTys.push_back(Ty);
+
+      // Verify the overloaded type "matches" the Ref type.
+      // i.e. Ty is a vector with the same width as Ref.
+      // Composed of pointers to the same element type as Ref.
+      VectorType *ReferenceType = dyn_cast<VectorType>(ArgTys[RefArgNumber]);
      VectorType *ThisArgVecTy = dyn_cast<VectorType>(Ty);
      if (!ThisArgVecTy || !ReferenceType ||
          (ReferenceType->getVectorNumElements() !=
--- a/lib/IR/IRBuilder.cpp
+++ b/lib/IR/IRBuilder.cpp
@ -293,11 +293,13 @@ CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, unsigned Align,
    Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
                                     NumElts));

+  Type *OverloadedTypes[] = {DataTy, PtrsTy};
  Value * Ops[] = {Ptrs, getInt32(Align), Mask, UndefValue::get(DataTy)};

  // We specify only one type when we create this intrinsic. Types of other
  // arguments are derived from this type.
-  return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, { DataTy }, Name);
+  return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, OverloadedTypes,
+                               Name);
 }

 /// \brief Create a call to a Masked Scatter intrinsic.
@ -323,11 +325,13 @@ CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs,
  if (!Mask)
    Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
                                     NumElts));
+
+  Type *OverloadedTypes[] = {DataTy, PtrsTy};
  Value * Ops[] = {Data, Ptrs, getInt32(Align), Mask};

  // We specify only one type when we create this intrinsic. Types of other
  // arguments are derived from this type.
-  return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, { DataTy });
+  return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, OverloadedTypes);
 }

 template <typename T0, typename T1, typename T2, typename T3>
--- a/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
+++ b/test/Analysis/CostModel/X86/masked-intrinsic-cost.ll
@ -78,10 +78,10 @@ define <2 x double> @test_gather_2f64(<2 x double*> %ptrs, <2 x i1> %mask, <2 x
 ; SKX-LABEL: test_gather_2f64
 ; SKX: Found an estimated cost of 7 {{.*}}.gather

-%res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
+%res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 4, <2 x i1> %mask, <2 x double> %src0)
  ret <2 x double> %res
 }
-declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)

 define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %src0)  {

@ -94,7 +94,7 @@ define <4 x i32> @test_gather_4i32(<4 x i32*> %ptrs, <4 x i1> %mask, <4 x i32> %
 ; SKX-LABEL: test_gather_4i32
 ; SKX: Found an estimated cost of 6 {{.*}}.gather

-%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
+%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> %src0)
  ret <4 x i32> %res
 }

@ -109,10 +109,10 @@ define <4 x i32> @test_gather_4i32_const_mask(<4 x i32*> %ptrs, <4 x i32> %src0)
 ; SKX-LABEL: test_gather_4i32_const_mask
 ; SKX: Found an estimated cost of 6 {{.*}}.gather

-%res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
+%res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> %src0)
  ret <4 x i32> %res
 }
-declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32, <4 x i1> %mask, <4 x i32> %src0)

 define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind) {

@ -128,7 +128,7 @@ define <16 x float> @test_gather_16f32_const_mask(float* %base, <16 x i32> %ind)
  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind

-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
 }

@ -146,7 +146,7 @@ define <16 x float> @test_gather_16f32_var_mask(float* %base, <16 x i32> %ind, <
  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.v = getelementptr float, float* %base, <16 x i64> %sext_ind

-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
  ret <16 x float>%res
 }

@ -164,7 +164,7 @@ define <16 x float> @test_gather_16f32_ra_var_mask(<16 x float*> %ptrs, <16 x i3
  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.v = getelementptr float, <16 x float*> %ptrs, <16 x i64> %sext_ind

-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32 4, <16 x i1> %mask, <16 x float> undef)
  ret <16 x float>%res
 }

@ -185,7 +185,7 @@ define <16 x float> @test_gather_16f32_const_mask2(float* %base, <16 x i32> %ind
  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind

-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
 }

@ -204,7 +204,7 @@ define void @test_scatter_16i32(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i3

  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
  %imask = bitcast i16 %mask to <16 x i1>
-  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
  ret void
 }

@ -218,11 +218,11 @@ define void @test_scatter_8i32(<8 x i32>%a1, <8 x i32*> %ptr, <8 x i1>%mask) {
 ; SKX-LABEL: test_scatter_8i32
 ; SKX: Found an estimated cost of 10 {{.*}}.scatter

-  call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> %mask)
  ret void
 }

-declare void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask)
+declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32, <8 x i1> %mask)

 define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
 ; AVX2-LABEL: test_scatter_4i32
@ -234,7 +234,7 @@ define void @test_scatter_4i32(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
 ; SKX-LABEL: test_scatter_4i32
 ; SKX: Found an estimated cost of 6 {{.*}}.scatter

-  call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
  ret void
 }

@ -252,7 +252,7 @@ define <4 x float> @test_gather_4f32(float* %ptr, <4 x i32> %ind, <4 x i1>%mask)
  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind

-  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> %mask, <4 x float> undef)
  ret <4 x float>%res
 }

@ -270,14 +270,14 @@ define <4 x float> @test_gather_4f32_const_mask(float* %ptr, <4 x i32> %ind) {
  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.v = getelementptr float, float* %ptr, <4 x i64> %sext_ind

-  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> undef)
  ret <4 x float>%res
 }

-declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float> )
-declare void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask)
-declare void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask)
-declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.v, i32, <4 x i1> %mask, <4 x float> )
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32, <4 x i1> %mask)
+declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32, <16 x i1> %imask)
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.v, i32, <16 x i1> %mask, <16 x float>)

 declare <16 x i32> @llvm.masked.load.v16i32.p0v16i32(<16 x i32>*, i32, <16 x i1>, <16 x i32>)
 declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
--- a/test/Analysis/CostModel/X86/vector_gep.ll
+++ b/test/Analysis/CostModel/X86/vector_gep.ll
@ -3,7 +3,7 @@
 %struct.S = type { [1000 x i32] }


-declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)

 define <4 x i32> @foov(<4 x %struct.S*> %s, i64 %base){
  %temp = insertelement <4 x i64> undef, i64 %base, i32 0
@ -12,6 +12,6 @@ define <4 x i32> @foov(<4 x %struct.S*> %s, i64 %base){
  %B = getelementptr inbounds %struct.S, <4 x %struct.S*> %s, <4 x i32> zeroinitializer, <4 x i32> zeroinitializer
 ;CHECK: cost of 0 for instruction: {{.*}} getelementptr inbounds [1000 x i32]
  %arrayidx = getelementptr inbounds [1000 x i32], <4 x [1000 x i32]*> %B, <4 x i64> zeroinitializer, <4 x i64> %vector
-  %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %arrayidx, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %arrayidx, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
  ret <4 x i32> %res
 }
--- a/test/Assembler/auto_upgrade_intrinsics.ll
+++ b/test/Assembler/auto_upgrade_intrinsics.ll
@ -85,6 +85,23 @@ define void @tests.masked.store(<2 x double>* %ptr, <2 x i1> %mask, <2 x double>
  ret void
 }

+declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
+
+define <2 x double> @tests.masked.gather(<2 x double*> %ptr, <2 x i1> %mask, <2 x double> %passthru)  {
+; CHECK-LABEL: @tests.masked.gather(
+; CHECK: @llvm.masked.gather.v2f64.v2p0f64
+  %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptr, i32 1, <2 x i1> %mask, <2 x double> %passthru)
+  ret <2 x double> %res
+}
+
+declare void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptrs, i32, <2 x i1> %mask)
+
+define void @tests.masked.scatter(<2 x double*> %ptr, <2 x i1> %mask, <2 x double> %val)  {
+; CHECK-LABEL: @tests.masked.scatter(
+; CHECK: @llvm.masked.scatter.v2f64.v2p0f64
+  call void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptr, i32 3, <2 x i1> %mask)
+  ret void
+}

 declare {}* @llvm.invariant.start(i64, i8* nocapture) nounwind readonly
 declare void @llvm.invariant.end({}*, i64, i8* nocapture) nounwind
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@ -54,13 +54,13 @@ define <16 x float> @test1(float* %base, <16 x i32> %ind) {
  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind

-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
 }

-declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
-declare <16 x float> @llvm.masked.gather.v16f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
-declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>)
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>, <16 x float>)
+declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> , i32, <8 x i1> , <8 x i32> )


 ; SCALAR-LABEL: test2
@ -111,7 +111,7 @@ define <16 x float> @test2(float* %base, <16 x i32> %ind, i16 %mask) {
  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind
  %imask = bitcast i16 %mask to <16 x i1>
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> %imask, <16 x float>undef)
  ret <16 x float> %res
 }

@ -152,7 +152,7 @@ define <16 x i32> @test3(i32* %base, <16 x i32> %ind, i16 %mask) {
  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i64> %sext_ind
  %imask = bitcast i16 %mask to <16 x i1>
-  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
+  %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
  ret <16 x i32> %res
 }

@ -205,8 +205,8 @@ define <16 x i32> @test4(i32* %base, <16 x i32> %ind, i16 %mask) {

  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
  %imask = bitcast i16 %mask to <16 x i1>
-  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
-  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
+  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
+  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %gep.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
  %res = add <16 x i32> %gt1, %gt2
  ret <16 x i32> %res
 }
@ -270,13 +270,13 @@ define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {

  %gep.random = getelementptr i32, <16 x i32*> %broadcast.splat, <16 x i32> %ind
  %imask = bitcast i16 %mask to <16 x i1>
-  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
-  call void @llvm.masked.scatter.v16i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
+  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32>%val, <16 x i32*> %gep.random, i32 4, <16 x i1> %imask)
  ret void
 }

-declare void @llvm.masked.scatter.v8i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
-declare void @llvm.masked.scatter.v16i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )
+declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> , <8 x i32*> , i32 , <8 x i1> )
+declare void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> , <16 x i32*> , i32 , <16 x i1> )


 ; SCALAR-LABEL: test6
@ -326,9 +326,9 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x i32*> %ptr) {
 ; SKX_32-NEXT:    vmovdqa %ymm2, %ymm0
 ; SKX_32-NEXT:    retl

-  %a = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %a = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)

-  call void @llvm.masked.scatter.v8i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+  call void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32> %a1, <8 x i32*> %ptr, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
  ret <8 x i32>%a
 }

@ -384,8 +384,8 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {

  %gep.random = getelementptr i32, <8 x i32*> %broadcast.splat, <8 x i32> %ind
  %imask = bitcast i8 %mask to <8 x i1>
-  %gt1 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
-  %gt2 = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
+  %gt1 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>undef)
+  %gt2 = call <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*> %gep.random, i32 4, <8 x i1> %imask, <8 x i32>%gt1)
  %res = add <8 x i32> %gt1, %gt2
  ret <8 x i32> %res
 }
@ -444,8 +444,8 @@ define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
 ; SKX_32-NEXT:    retl

  %imask = bitcast i16 %mask to <16 x i1>
-  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
-  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
+  %gt1 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>undef)
+  %gt2 = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptr.random, i32 4, <16 x i1> %imask, <16 x i32>%gt1)
  %res = add <16 x i32> %gt1, %gt2
  ret <16 x i32> %res
 }
@ -522,7 +522,7 @@ entry:
  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer

  %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %ind1, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>, <8 x i32><i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, <8 x i32> %ind5, <8 x i64> <i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13, i64 13>
-  %res = call <8 x i32 >  @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %res = call <8 x i32 >  @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %res
 }

@ -591,7 +591,7 @@ entry:
  %broadcast.splat = shufflevector <8 x %struct.ST*> %broadcast.splatinsert, <8 x %struct.ST*> undef, <8 x i32> zeroinitializer

  %arrayidx = getelementptr  %struct.ST, <8 x %struct.ST*> %broadcast.splat, <8 x i64> %i1, i32 2, i32 1, <8 x i32> %ind5, i64 13
-  %res = call <8 x i32 >  @llvm.masked.gather.v8i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
+  %res = call <8 x i32 >  @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>%arrayidx, i32 4, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef)
  ret <8 x i32> %res
 }

@ -632,7 +632,7 @@ define <16 x float> @test11(float* %base, i32 %ind) {

  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind

-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
 }

@ -671,7 +671,7 @@ define <16 x float> @test12(float* %base, <16 x i32> %ind) {
  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind

-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
 }

@ -710,7 +710,7 @@ define <16 x float> @test13(float* %base, <16 x i32> %ind) {
  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, float *%base, <16 x i64> %sext_ind

-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
 }

@ -772,13 +772,13 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {

  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, i32 %ind

-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
  ret <16 x float>%res
 }

-declare <4 x float> @llvm.masked.gather.v4f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
-declare <4 x double> @llvm.masked.gather.v4f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
-declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*>, i32, <2 x i1>, <2 x double>)
+declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>)
+declare <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*>, i32, <4 x i1>, <4 x double>)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*>, i32, <2 x i1>, <2 x double>)

 ; Gather smaller than existing instruction
 define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
@ -831,7 +831,7 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {

  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.random = getelementptr float, float* %base, <4 x i64> %sext_ind
-  %res = call <4 x float> @llvm.masked.gather.v4f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
+  %res = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gep.random, i32 4, <4 x i1> %mask, <4 x float> undef)
  ret <4 x float>%res
 }

@ -890,7 +890,7 @@ define <4 x double> @test16(double* %base, <4 x i32> %ind, <4 x i1> %mask, <4 x

  %sext_ind = sext <4 x i32> %ind to <4 x i64>
  %gep.random = getelementptr double, double* %base, <4 x i64> %sext_ind
-  %res = call <4 x double> @llvm.masked.gather.v4f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
+  %res = call <4 x double> @llvm.masked.gather.v4f64.v4p0f64(<4 x double*> %gep.random, i32 4, <4 x i1> %mask, <4 x double> %src0)
  ret <4 x double>%res
 }

@ -942,15 +942,15 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x

  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr double, double* %base, <2 x i64> %sext_ind
-  %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
+  %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %gep.random, i32 4, <2 x i1> %mask, <2 x double> %src0)
  ret <2 x double>%res
 }

-declare void @llvm.masked.scatter.v4i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
-declare void @llvm.masked.scatter.v4f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
-declare void @llvm.masked.scatter.v2i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
-declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
-declare void @llvm.masked.scatter.v2f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> , <4 x i32*> , i32 , <4 x i1> )
+declare void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> , <4 x double*> , i32 , <4 x i1> )
+declare void @llvm.masked.scatter.v2i64.v2p0i64(<2 x i64> , <2 x i64*> , i32 , <2 x i1> )
+declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
+declare void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> , <2 x float*> , i32 , <2 x i1> )

 define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
 ;
@ -995,7 +995,7 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
 ; SKX_32-NEXT:    vptestmd %xmm2, %xmm2, %k1
 ; SKX_32-NEXT:    vpscatterdd %xmm0, (,%xmm1) {%k1}
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v4i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %a1, <4 x i32*> %ptr, i32 4, <4 x i1> %mask)
  ret void
 }

@ -1049,7 +1049,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
  %gep = getelementptr double, double* %ptr, <4 x i64> %ind
-  call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
+  call void @llvm.masked.scatter.v4f64.v4p0f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
  ret void
 }

@ -1103,7 +1103,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
 ; SKX_32-NEXT:    kshiftrb $6, %k0, %k1
 ; SKX_32-NEXT:    vscatterdps %xmm0, (,%xmm1) {%k1}
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v2f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
+  call void @llvm.masked.scatter.v2f32.v2p0f32(<2 x float> %a1, <2 x float*> %ptr, i32 4, <2 x i1> %mask)
  ret void
 }

@ -1157,12 +1157,12 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
 ; SKX_32-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
  ret void
 }

 ; The result type requires widening
-declare <2 x float> @llvm.masked.gather.v2f32(<2 x float*>, i32, <2 x i1>, <2 x float>)
+declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>)

 define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float> %src0) {
 ;
@ -1222,12 +1222,12 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
 ; SKX_32-NEXT:    retl
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
-  %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> %mask, <2 x float> %src0)
  ret <2 x float>%res
 }

-declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
-declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
+declare <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>)

 define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %src0) {
 ;
@ -1276,7 +1276,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
 ; SKX_32-NEXT:    retl
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
-  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
+  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> %mask, <2 x i32> %src0)
  ret <2 x i32>%res
 }

@ -1320,7 +1320,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
 ; SKX_32-NEXT:    retl
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i32, i32* %base, <2 x i64> %sext_ind
-  %res = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
+  %res = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x i32> undef)
  ret <2 x i32>%res
 }

@ -1371,7 +1371,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
 ; SKX_32-NEXT:    retl
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
-  %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
+  %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> %mask, <2 x i64> %src0)
  ret <2 x i64>%res
 }

@ -1418,7 +1418,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
 ; SKX_32-NEXT:    retl
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr i64, i64* %base, <2 x i64> %sext_ind
-  %res = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
+  %res = call <2 x i64> @llvm.masked.gather.v2i64.v2p0i64(<2 x i64*> %gep.random, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> %src0)
  ret <2 x i64>%res
 }

@ -1466,7 +1466,7 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) {
 ; SKX_32-NEXT:    retl
  %sext_ind = sext <2 x i32> %ind to <2 x i64>
  %gep.random = getelementptr float, float* %base, <2 x i64> %sext_ind
-  %res = call <2 x float> @llvm.masked.gather.v2f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
+  %res = call <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*> %gep.random, i32 4, <2 x i1> <i1 true, i1 true>, <2 x float> undef)
  ret <2 x float>%res
 }

@ -1515,7 +1515,7 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
 ; SKX_32-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
  ret void
 }

@ -1568,23 +1568,23 @@ define <16 x float> @test29(float* %base, <16 x i32> %ind) {
  %sext_ind = sext <16 x i32> %ind to <16 x i64>
  %gep.random = getelementptr float, <16 x float*> %broadcast.splat, <16 x i64> %sext_ind

-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %gep.random, i32 4, <16 x i1> <i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <16 x float> undef)
  ret <16 x float>%res
 }

 ; Check non-power-of-2 case. It should be scalarized.
-declare <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
+declare <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*>, i32, <3 x i1>, <3 x i32>)
 define <3 x i32> @test30(<3 x i32*> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32> %src0) {
 ; ALL-LABEL: test30:
 ; ALL-NOT:       gather

  %sext_ind = sext <3 x i32> %ind to <3 x i64>
  %gep.random = getelementptr i32, <3 x i32*> %base, <3 x i64> %sext_ind
-  %res = call <3 x i32> @llvm.masked.gather.v3i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
+  %res = call <3 x i32> @llvm.masked.gather.v3i32.v3p0i32(<3 x i32*> %gep.random, i32 4, <3 x i1> %mask, <3 x i32> %src0)
  ret <3 x i32>%res
 }

-declare <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)
+declare <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**>, i32, <16 x i1>, <16 x float*>)

 ; KNL-LABEL: test31
 ; KNL: vpgatherqq
@ -1626,7 +1626,7 @@ define <16 x float*> @test31(<16 x float**> %ptrs) {
 ; SKX_32-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; SKX_32-NEXT:    retl

-  %res = call <16 x float*> @llvm.masked.gather.v16p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
+  %res = call <16 x float*> @llvm.masked.gather.v16p0f32.v16p0p0f32(<16 x float**> %ptrs, i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x float*> undef)
  ret <16 x float*>%res
 }

@ -1672,7 +1672,7 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i
 ; SKX_32-NEXT:    vpgatherdd (,%zmm0), %zmm2 {%k1}
 ; SKX_32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; SKX_32-NEXT:    retl
-  %res = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
+  %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <16 x i32> %src0)
  ret <16 x i32> %res
 }
 define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
@ -1749,10 +1749,10 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
 ; SKX_32-NEXT:    movl %ebp, %esp
 ; SKX_32-NEXT:    popl %ebp
 ; SKX_32-NEXT:    retl
-  %res = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
+  %res = call <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32 4, <16 x i1> %mask, <16 x i64> %src0)
  ret <16 x i64> %res
 }
-declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
+declare <16 x i64> @llvm.masked.gather.v16i64.v16p0i64(<16 x i64*> %ptrs, i32, <16 x i1> %mask, <16 x i64> %src0)
 define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
 ; KNL_64-LABEL: test_gather_16f32:
 ; KNL_64:       # BB#0:
@ -1795,7 +1795,7 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16
 ; SKX_32-NEXT:    vgatherdps (,%zmm0), %zmm2 {%k1}
 ; SKX_32-NEXT:    vmovaps %zmm2, %zmm0
 ; SKX_32-NEXT:    retl
-  %res = call <16 x float> @llvm.masked.gather.v16f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1> %mask, <16 x float> %src0)
  ret <16 x float> %res
 }
 define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
@ -1872,10 +1872,10 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
 ; SKX_32-NEXT:    movl %ebp, %esp
 ; SKX_32-NEXT:    popl %ebp
 ; SKX_32-NEXT:    retl
-  %res = call <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
+  %res = call <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32 4, <16 x i1> %mask, <16 x double> %src0)
  ret <16 x double> %res
 }
-declare <16 x double> @llvm.masked.gather.v16f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
+declare <16 x double> @llvm.masked.gather.v16f64.v16p0f64(<16 x double*> %ptrs, i32, <16 x i1> %mask, <16 x double> %src0)
 define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %src0)  {
 ; KNL_64-LABEL: test_scatter_16i32:
 ; KNL_64:       # BB#0:
@ -1918,7 +1918,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
 ; SKX_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
+  call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
 }
 define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %src0)  {
@ -1993,10 +1993,10 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
 ; SKX_32-NEXT:    popl %ebp
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
+  call void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
 }
-declare void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
+declare void @llvm.masked.scatter.v16i64.v16p0i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32, <16 x i1> %mask)
 define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x float> %src0)  {
 ; KNL_64-LABEL: test_scatter_16f32:
 ; KNL_64:       # BB#0:
@ -2039,10 +2039,10 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa
 ; SKX_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
+  call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
 }
-declare void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
+declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %src0, <16 x float*> %ptrs, i32, <16 x i1> %mask)
 define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x double> %src0)  {
 ; KNL_64-LABEL: test_scatter_16f64:
 ; KNL_64:       # BB#0:
@ -2115,10 +2115,10 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
 ; SKX_32-NEXT:    popl %ebp
 ; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
-  call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
+  call void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
  ret void
 }
-declare void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)
+declare void @llvm.masked.scatter.v16f64.v16p0f64(<16 x double> %src0, <16 x double*> %ptrs, i32, <16 x i1> %mask)

 define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i64> %d) {
 ; KNL_64-LABEL: test_pr28312:
@ -2193,11 +2193,11 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
 ; SKX_32-NEXT:    movl %ebp, %esp
 ; SKX_32-NEXT:    popl %ebp
 ; SKX_32-NEXT:    retl
-  %g1 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
-  %g2 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
-  %g3 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
+  %g1 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
+  %g2 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
+  %g3 = call <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
  %a = add <4 x i64> %g1, %g2
  %b = add <4 x i64> %a, %g3
  ret <4 x i64> %b
 }
-declare <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)
+declare <4 x i64> @llvm.masked.gather.v4i64.v4p0i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>)
--- a/test/Transforms/FunctionAttrs/readattrs.ll
+++ b/test/Transforms/FunctionAttrs/readattrs.ll
@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -functionattrs -S | FileCheck %s
 ; RUN: opt < %s -aa-pipeline=basic-aa -passes='cgscc(function-attrs)' -S | FileCheck %s
@x = global i32 0
@ -68,22 +69,22 @@ entry:
 }

 ; CHECK: declare void @llvm.masked.scatter
-declare void @llvm.masked.scatter.v4i32(<4 x i32>%val, <4 x i32*>, i32, <4 x i1>)
+declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>%val, <4 x i32*>, i32, <4 x i1>)

 ; CHECK-NOT: readnone
 ; CHECK-NOT: readonly
 ; CHECK: define void @test9
 define void @test9(<4 x i32*> %ptrs, <4 x i32>%val) {
-  call void @llvm.masked.scatter.v4i32(<4 x i32>%val, <4 x i32*> %ptrs, i32 4, <4 x i1><i1 true, i1 false, i1 true, i1 false>)
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>%val, <4 x i32*> %ptrs, i32 4, <4 x i1><i1 true, i1 false, i1 true, i1 false>)
  ret void
 }

 ; CHECK: declare <4 x i32> @llvm.masked.gather
-declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
+declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
 ; CHECK: readonly
 ; CHECK: define <4 x i32> @test10
 define <4 x i32> @test10(<4 x i32*> %ptrs) {
-  %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1><i1 true, i1 false, i1 true, i1 false>, <4 x i32>undef)
+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs, i32 4, <4 x i1><i1 true, i1 false, i1 true, i1 false>, <4 x i32>undef)
  ret <4 x i32> %res
 }

--- a/test/Transforms/GVN/2016-08-30-MaskedScatterGather.ll
+++ b/test/Transforms/GVN/2016-08-30-MaskedScatterGather.ll
@ -1,7 +1,7 @@
 ; RUN: opt < %s -basicaa -gvn -S | FileCheck %s

-declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
-declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
+declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)

 ; This test ensures that masked scatter and gather operations, which take vectors of pointers,
 ; do not have pointer aliasing ignored when being processed.
@ -20,18 +20,18 @@ entry:
  %tmp.i = insertelement <2 x i32*> undef, i32* %tmp.0, i32 0
  %tmp = insertelement <2 x i32*> %tmp.i, i32* %tmp.1, i32 1
  ; Read from in1 and in2
-  %in1.v = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %in1, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
-  %in2.v = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %in2, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %in1.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in1, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %in2.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in2, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
  ; Store in1 to the allocas
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
  ; Read in1 from the allocas
  ; This gather should alias the scatter we just saw
-  %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
  ; Store in2 to the allocas
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
  ; Read in2 from the allocas
  ; This gather should alias the scatter we just saw, and not be eliminated
-  %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
  ; Store in2 to out for good measure
  %tmp.v.1.0 = extractelement <2 x i32> %tmp.v.1, i32 0
  %tmp.v.1.1 = extractelement <2 x i32> %tmp.v.1, i32 1
--- a/test/Transforms/InstCombine/masked_intrinsics.ll
+++ b/test/Transforms/InstCombine/masked_intrinsics.ll
@ -2,8 +2,8 @@

 declare <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
 declare void @llvm.masked.store.v2f64.p0v2f64(<2 x double> %val, <2 x double>* %ptrs, i32, <2 x i1> %mask)
-declare <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %passthru)
-declare void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptrs, i32, <2 x i1> %mask)
+declare <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32, <2 x i1> %mask, <2 x double> %passthru)
+declare void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %val, <2 x double*> %ptrs, i32, <2 x i1> %mask)

 define <2 x double> @load_zeromask(<2 x double>* %ptr, <2 x double> %passthru)  {
  %res = call <2 x double> @llvm.masked.load.v2f64.p0v2f64(<2 x double>* %ptr, i32 1, <2 x i1> zeroinitializer, <2 x double> %passthru)
@ -49,7 +49,7 @@ define void @store_onemask(<2 x double>* %ptr, <2 x double> %val)  {
 }

 define <2 x double> @gather_zeromask(<2 x double*> %ptrs, <2 x double> %passthru)  {
-  %res = call <2 x double> @llvm.masked.gather.v2f64(<2 x double*> %ptrs, i32 5, <2 x i1> zeroinitializer, <2 x double> %passthru)
+  %res = call <2 x double> @llvm.masked.gather.v2f64.v2p0f64(<2 x double*> %ptrs, i32 5, <2 x i1> zeroinitializer, <2 x double> %passthru)
  ret <2 x double> %res

 ; CHECK-LABEL: @gather_zeromask(
@ -57,7 +57,7 @@ define <2 x double> @gather_zeromask(<2 x double*> %ptrs, <2 x double> %passthru
 }

 define void @scatter_zeromask(<2 x double*> %ptrs, <2 x double> %val)  {
-  call void @llvm.masked.scatter.v2f64(<2 x double> %val, <2 x double*> %ptrs, i32 6, <2 x i1> zeroinitializer)
+  call void @llvm.masked.scatter.v2f64.v2p0f64(<2 x double> %val, <2 x double*> %ptrs, i32 6, <2 x i1> zeroinitializer)
  ret void

 ; CHECK-LABEL: @scatter_zeromask(
--- a/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@ -36,7 +36,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
 ; CHECK-NEXT:    [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP2]]
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80>
 ; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
--- a/test/Transforms/LoopVectorize/X86/gather_scatter.ll
+++ b/test/Transforms/LoopVectorize/X86/gather_scatter.ll
@ -17,9 +17,9 @@ target triple = "x86_64-pc_linux"
 ;}

 ;AVX512-LABEL: @foo1
-;AVX512: llvm.masked.load.v16i32
-;AVX512: llvm.masked.gather.v16f32
-;AVX512: llvm.masked.store.v16f32
+;AVX512: llvm.masked.load.v16i32.p0v16i32
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
+;AVX512: llvm.masked.store.v16f32.p0v16f32
 ;AVX512: ret void

 ; Function Attrs: nounwind uwtable
@ -96,8 +96,8 @@ for.end:                                          ; preds = %for.cond

 ;AVX512-LABEL: @foo2
 ;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.gather.v16f32
-;AVX512: llvm.masked.scatter.v16f32
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
+;AVX512: llvm.masked.scatter.v16f32.v16p0f32
 ;AVX512: ret void
 define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
 entry:
@ -171,10 +171,10 @@ for.end:                                          ; preds = %for.cond

 ;AVX512-LABEL: @foo3
 ;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.gather.v16f32
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
 ;AVX512: fadd <16 x float>
 ;AVX512: getelementptr inbounds %struct.Out, %struct.Out* %out, <16 x i64> {{.*}}, i32 1
-;AVX512: llvm.masked.scatter.v16f32
+;AVX512: llvm.masked.scatter.v16f32.v16p0f32
 ;AVX512: ret void

 %struct.Out = type { float, float }
@ -233,4 +233,194 @@ for.inc:                                          ; preds = %if.end
 for.end:                                          ; preds = %for.cond
  ret void
 }
-declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
+declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>)
+
+; The same as @foo2 but scatter/gather argument is a vecotr of ptrs with addresspace 1
+
+;AVX512-LABEL: @foo2_addrspace
+;AVX512: getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, <16 x i64> {{.*}}, i32 1
+;AVX512: llvm.masked.gather.v16f32.v16p1f32
+;AVX512: llvm.masked.scatter.v16f32.v16p1f32
+;AVX512: ret void
+define void @foo2_addrspace(%struct.In addrspace(1)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
+entry:
+  %in.addr = alloca %struct.In addrspace(1)*, align 8
+  %out.addr = alloca float addrspace(1)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
+  store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
+  %6 = load float, float addrspace(1)* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
+  store float %add, float addrspace(1)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Same as foo2_addrspace but here only the input has the non-default address space.
+
+;AVX512-LABEL: @foo2_addrspace2
+;AVX512: getelementptr inbounds %struct.In, %struct.In addrspace(1)* %in, <16 x i64> {{.*}}, i32 1
+;AVX512: llvm.masked.gather.v16f32.v16p1f32
+;AVX512: llvm.masked.scatter.v16f32.v16p0f32
+;AVX512: ret void
+define void @foo2_addrspace2(%struct.In addrspace(1)* noalias %in, float addrspace(0)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+entry:
+  %in.addr = alloca %struct.In addrspace(1)*, align 8
+  %out.addr = alloca float addrspace(0)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In addrspace(1)* %in, %struct.In addrspace(1)** %in.addr, align 8
+  store float addrspace(0)* %out, float addrspace(0)** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In addrspace(1)*, %struct.In addrspace(1)** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In addrspace(1)* %arrayidx3, i32 0, i32 1
+  %6 = load float, float addrspace(1)* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float addrspace(0)*, float addrspace(0)** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float addrspace(0)* %8, i64 %idxprom4
+  store float %add, float addrspace(0)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
+
+; Same as foo2_addrspace but here only the output has the non-default address space.
+
+;AVX512-LABEL: @foo2_addrspace3
+;AVX512: getelementptr inbounds %struct.In, %struct.In* %in, <16 x i64> {{.*}}, i32 1
+;AVX512: llvm.masked.gather.v16f32.v16p0f32
+;AVX512: llvm.masked.scatter.v16f32.v16p1f32
+;AVX512: ret void
+
+define void @foo2_addrspace3(%struct.In addrspace(0)* noalias %in, float addrspace(1)* noalias %out, i32* noalias %trigger, i32* noalias %index) {
+entry:
+  %in.addr = alloca %struct.In addrspace(0)*, align 8
+  %out.addr = alloca float addrspace(1)*, align 8
+  %trigger.addr = alloca i32*, align 8
+  %index.addr = alloca i32*, align 8
+  %i = alloca i32, align 4
+  store %struct.In addrspace(0)* %in, %struct.In addrspace(0)** %in.addr, align 8
+  store float addrspace(1)* %out, float addrspace(1)** %out.addr, align 8
+  store i32* %trigger, i32** %trigger.addr, align 8
+  store i32* %index, i32** %index.addr, align 8
+  store i32 0, i32* %i, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %i, align 4
+  %cmp = icmp slt i32 %0, 4096
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load i32, i32* %i, align 4
+  %idxprom = sext i32 %1 to i64
+  %2 = load i32*, i32** %trigger.addr, align 8
+  %arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
+  %3 = load i32, i32* %arrayidx, align 4
+  %cmp1 = icmp sgt i32 %3, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:                                          ; preds = %for.body
+  %4 = load i32, i32* %i, align 4
+  %idxprom2 = sext i32 %4 to i64
+  %5 = load %struct.In addrspace(0)*, %struct.In addrspace(0)** %in.addr, align 8
+  %arrayidx3 = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %5, i64 %idxprom2
+  %b = getelementptr inbounds %struct.In, %struct.In addrspace(0)* %arrayidx3, i32 0, i32 1
+  %6 = load float, float addrspace(0)* %b, align 4
+  %add = fadd float %6, 5.000000e-01
+  %7 = load i32, i32* %i, align 4
+  %idxprom4 = sext i32 %7 to i64
+  %8 = load float addrspace(1)*, float addrspace(1)** %out.addr, align 8
+  %arrayidx5 = getelementptr inbounds float, float addrspace(1)* %8, i64 %idxprom4
+  store float %add, float addrspace(1)* %arrayidx5, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %for.body
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end
+  %9 = load i32, i32* %i, align 4
+  %inc = add nsw i32 %9, 16
+  store i32 %inc, i32* %i, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
--- a/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@ -23,11 +23,11 @@ define void @_Z3fn1v() #0 {
 ; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
 ; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32.v16p0i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
 ; CHECK-NEXT:    [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
--- a/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather.ll
+++ b/test/Transforms/NewGVN/2016-08-30-MaskedScatterGather.ll
@ -1,8 +1,8 @@
 ; XFAIL: *
 ; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s

-declare void @llvm.masked.scatter.v2i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
-declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
+declare void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> , <2 x i32*> , i32 , <2 x i1> )
+declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)

 ; This test ensures that masked scatter and gather operations, which take vectors of pointers,
 ; do not have pointer aliasing ignored when being processed.
@ -21,18 +21,18 @@ entry:
  %tmp.i = insertelement <2 x i32*> undef, i32* %tmp.0, i32 0
  %tmp = insertelement <2 x i32*> %tmp.i, i32* %tmp.1, i32 1
  ; Read from in1 and in2
-  %in1.v = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %in1, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
-  %in2.v = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %in2, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %in1.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in1, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %in2.v = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %in2, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
  ; Store in1 to the allocas
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in1.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
  ; Read in1 from the allocas
  ; This gather should alias the scatter we just saw
-  %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %tmp.v.0 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
  ; Store in2 to the allocas
-  call void @llvm.masked.scatter.v2i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
+  call void @llvm.masked.scatter.v2i32.v2p0i32(<2 x i32> %in2.v, <2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>);
  ; Read in2 from the allocas
  ; This gather should alias the scatter we just saw, and not be eliminated
-  %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
+  %tmp.v.1 = call <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*> %tmp, i32 1, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) #1
  ; Store in2 to out for good measure
  %tmp.v.1.0 = extractelement <2 x i32> %tmp.v.1, i32 0
  %tmp.v.1.1 = extractelement <2 x i32> %tmp.v.1, i32 1
--- a/test/Verifier/scatter_gather.ll
+++ b/test/Verifier/scatter_gather.ll
@ -0,0 +1,122 @@
+; RUN: not opt -verify < %s 2>&1 | FileCheck %s
+
+; Mask is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define <16 x float> @gather2(<16 x float*> %ptrs, <16 x i1>* %mask, <16 x float> %passthru) {
+  %res = call <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*> %ptrs, i32 4, <16 x i1>* %mask, <16 x float> %passthru)
+  ret <16 x float> %res
+}
+declare <16 x float> @llvm.masked.gather.v16f32.v16p0f32(<16 x float*>, i32, <16 x i1>*, <16 x float>)
+
+; Mask length != return length
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather3(<8 x float*> %ptrs, <16 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <16 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <16 x i1>, <8 x float>)
+
+; Return type is not a vector
+; CHECK: Intrinsic has incorrect return type!
+define <8 x float>* @gather4(<8 x float*> %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float>* @llvm.masked.gather.p0v8f32.v8p0f32(<8 x float*> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float>* %res
+}
+declare <8 x float>* @llvm.masked.gather.p0v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>)
+
+; Value type is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather5(<8 x float*>* %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.p0v8p0f32(<8 x float*>* %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.p0v8p0f32(<8 x float*>*, i32, <8 x i1>, <8 x float>)
+
+; Value type is not a vector of pointers
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather6(<8 x float> %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.v8f32(<8 x float> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.v8f32(<8 x float>, i32, <8 x i1>, <8 x float>)
+
+; Value element type != vector of pointers element
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather7(<8 x double*> %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.v8p0f64(<8 x double*> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.v8p0f64(<8 x double*>, i32, <8 x i1>, <8 x float>)
+
+; Value length!= vector of pointers length
+; CHECK: Intrinsic has incorrect argument type!
+define <8 x float> @gather8(<16 x float*> %ptrs, <8 x i1> %mask, <8 x float> %passthru) {
+  %res = call <8 x float> @llvm.masked.gather.v8f32.v16p0f32(<16 x float*> %ptrs, i32 4, <8 x i1> %mask, <8 x float> %passthru)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.masked.gather.v8f32.v16p0f32(<16 x float*>, i32, <8 x i1>, <8 x float>)
+
+; Passthru type doesn't match return type 
+; CHECK: Intrinsic has incorrect argument type!
+define <16 x i32> @gather9(<16 x i32*> %ptrs, <16 x i1> %mask, <8 x i32> %passthru) {
+  %res = call <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*> %ptrs, i32 4, <16 x i1> %mask, <8 x i32> %passthru)
+  ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <8 x i32>)
+
+; Mask is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter2(<16 x float> %value, <16 x float*> %ptrs, <16 x i1>* %mask) {
+  call void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float> %value, <16 x float*> %ptrs, i32 4, <16 x i1>* %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v16f32.v16p0f32(<16 x float>, <16 x float*>, i32, <16 x i1>*)
+
+; Mask length != value length
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter3(<8 x float> %value, <8 x float*> %ptrs, <16 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float> %value, <8 x float*> %ptrs, i32 4, <16 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.v8p0f32(<8 x float>, <8 x float*>, i32, <16 x i1>)
+
+; Value type is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter4(<8 x float>* %value, <8 x float*> %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.p0v8f32.v8p0f32(<8 x float>* %value, <8 x float*> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.p0v8f32.v8p0f32(<8 x float>*, <8 x float*>, i32, <8 x i1>)
+
+; ptrs is not a vector
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter5(<8 x float> %value, <8 x float*>* %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.p0v8p0f32(<8 x float> %value, <8 x float*>* %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.p0v8p0f32(<8 x float>, <8 x float*>*, i32, <8 x i1>)
+
+; Value type is not a vector of pointers
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter6(<8 x float> %value, <8 x float> %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.v8f32(<8 x float> %value, <8 x float> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.v8f32(<8 x float>, <8 x float>, i32, <8 x i1>)
+
+; Value element type != vector of pointers element
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter7(<8 x float> %value, <8 x double*> %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.v8p0f64(<8 x float> %value, <8 x double*> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.v8p0f64(<8 x float>, <8 x double*>, i32, <8 x i1>)
+
+; Value length!= vector of pointers length
+; CHECK: Intrinsic has incorrect argument type!
+define void @scatter8(<8 x float> %value, <16 x float*> %ptrs, <8 x i1> %mask) {
+  call void @llvm.masked.scatter.v8f32.v16p0f32(<8 x float> %value, <16 x float*> %ptrs, i32 4, <8 x i1> %mask)
+  ret void
+}
+declare void @llvm.masked.scatter.v8f32.v16p0f32(<8 x float>, <16 x float*>, i32, <8 x i1>)
+
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp
@ -211,13 +211,12 @@ enum IIT_Info {
  IIT_SAME_VEC_WIDTH_ARG = 31,
  IIT_PTR_TO_ARG = 32,
  IIT_PTR_TO_ELT = 33,
-  IIT_VEC_OF_PTRS_TO_ELT = 34,
+  IIT_VEC_OF_ANYPTRS_TO_ELT = 34,
  IIT_I128 = 35,
  IIT_V512 = 36,
  IIT_V1024 = 37
 };

-
 static void EncodeFixedValueType(MVT::SimpleValueType VT,
                                 std::vector<unsigned char> &Sig) {
  if (MVT(VT).isInteger()) {
@ -273,9 +272,16 @@ static void EncodeFixedType(Record *R, std::vector<unsigned char> &ArgCodes,
    }
    else if (R->isSubClassOf("LLVMPointerTo"))
      Sig.push_back(IIT_PTR_TO_ARG);
-    else if (R->isSubClassOf("LLVMVectorOfPointersToElt"))
-      Sig.push_back(IIT_VEC_OF_PTRS_TO_ELT);
-    else if (R->isSubClassOf("LLVMPointerToElt"))
+    else if (R->isSubClassOf("LLVMVectorOfAnyPointersToElt")) {
+      Sig.push_back(IIT_VEC_OF_ANYPTRS_TO_ELT);
+      unsigned ArgNo = ArgCodes.size();
+      ArgCodes.push_back(3 /*vAny*/);
+      // Encode overloaded ArgNo
+      Sig.push_back(ArgNo);
+      // Encode LLVMMatchType<Number> ArgNo
+      Sig.push_back(Number);
+      return;
+    } else if (R->isSubClassOf("LLVMPointerToElt"))
      Sig.push_back(IIT_PTR_TO_ELT);
    else
      Sig.push_back(IIT_ARG);