
Create masked gather and scatter intrinsics in Loop Vectorizer.

The loop vectorizer now knows how to vectorize GEPs and to create masked gather and scatter intrinsics for random memory accesses.

The feature is enabled for AVX-512 targets.
Differential Revision: http://reviews.llvm.org/D15690

llvm-svn: 261140
Elena Demikhovsky 2016-02-17 19:23:04 +00:00
parent 14d2c58ecf
commit fdd98d5776
5 changed files with 497 additions and 109 deletions
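
To make the targeted pattern concrete, here are C++ versions of two of the loops exercised by the new test added below (foo1 and foo3; SIZE corresponds to the 4096 trip count in the test IR). The indexed load in[index[i]] is a random access that can now be widened into a masked gather, and the strided field store out[i].b into a masked scatter, both under the predicate trigger[i] > 0.

// Illustrative source for the new test's foo1/foo3 patterns (names as in the
// test; SIZE is the 4096 trip count used there).
constexpr int SIZE = 4096;

struct In  { float a, b; };
struct Out { float a, b; };

// in[index[i]] is non-consecutive: with AVX-512 it becomes llvm.masked.gather
// instead of being scalarized.
void foo1(const float *__restrict__ in, float *__restrict__ out,
          const int *__restrict__ trigger, const int *__restrict__ index) {
  for (int i = 0; i < SIZE; ++i)
    if (trigger[i] > 0)
      out[i] = in[index[i]] + 0.5f;
}

// out[i].b is a strided (non-consecutive) store: it becomes
// llvm.masked.scatter.
void foo3(const In *__restrict__ in, Out *__restrict__ out,
          const int *__restrict__ trigger) {
  for (int i = 0; i < SIZE; ++i)
    if (trigger[i] > 0)
      out[i].b = in[i].b + 0.5f;
}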


@@ -436,6 +436,14 @@ public:
CallInst *CreateMaskedStore(Value *Val, Value *Ptr, unsigned Align,
Value *Mask);
/// \brief Create a call to Masked Gather intrinsic
CallInst *CreateMaskedGather(Value *Ptrs, unsigned Align, Value *Mask = 0,
Value *PassThru = 0, const Twine& Name = "");
/// \brief Create a call to Masked Scatter intrinsic
CallInst *CreateMaskedScatter(Value *Val, Value *Ptrs, unsigned Align,
Value *Mask = 0);
/// \brief Create an assume intrinsic call that allows the optimizer to
/// assume that the provided condition will be true.
CallInst *CreateAssumption(Value *Cond);
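
A minimal usage sketch of the two new entry points declared above (the helper name is hypothetical; Ptrs is assumed to be an already-formed vector of pointers, Data a matching data vector, and Mask may be null):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper, not part of the patch: emit a masked gather followed
// by a masked scatter through the new IRBuilder entry points.
static Value *emitGatherThenScatter(IRBuilder<> &B, Value *Ptrs, Value *Data,
                                    Value *Mask) {
  // Ptrs is a vector of pointers (e.g. <8 x float*>). Passing a null Mask or
  // PassThru falls back to the defaults: an all-ones mask and an undef
  // pass-through value.
  Value *Loaded = B.CreateMaskedGather(Ptrs, /*Align=*/4, Mask,
                                       /*PassThru=*/nullptr, "gathered");
  B.CreateMaskedScatter(Data, Ptrs, /*Align=*/4, Mask);
  return Loaded;
}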


@@ -201,18 +201,17 @@ CallInst *IRBuilderBase::CreateAssumption(Value *Cond) {
return createCallHelper(FnAssume, Ops, this);
}
/// Create a call to a Masked Load intrinsic.
/// Ptr - the base pointer for the load
/// Align - alignment of the source location
/// Mask - an vector of booleans which indicates what vector lanes should
/// \brief Create a call to a Masked Load intrinsic.
/// \p Ptr - base pointer for the load
/// \p Align - alignment of the source location
/// \p Mask - vector of booleans which indicates what vector lanes should
/// be accessed in memory
/// PassThru - a pass-through value that is used to fill the masked-off lanes
/// \p PassThru - pass-through value that is used to fill the masked-off lanes
/// of the result
/// Name - name of the result variable
/// \p Name - name of the result variable
CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align,
Value *Mask, Value *PassThru,
const Twine &Name) {
assert(Ptr->getType()->isPointerTy() && "Ptr must be of pointer type");
// DataTy is the overloaded type
Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
assert(DataTy->isVectorTy() && "Ptr should point to a vector");
@@ -222,11 +221,11 @@ CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align,
return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, DataTy, Name);
}
/// Create a call to a Masked Store intrinsic.
/// Val - the data to be stored,
/// Ptr - the base pointer for the store
/// Align - alignment of the destination location
/// Mask - an vector of booleans which indicates what vector lanes should
/// \brief Create a call to a Masked Store intrinsic.
/// \p Val - data to be stored,
/// \p Ptr - base pointer for the store
/// \p Align - alignment of the destination location
/// \p Mask - vector of booleans which indicates what vector lanes should
/// be accessed in memory
CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr,
unsigned Align, Value *Mask) {
@@ -247,6 +246,62 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id,
return createCallHelper(TheFn, Ops, this, Name);
}
/// \brief Create a call to a Masked Gather intrinsic.
/// \p Ptrs - vector of pointers for loading
/// \p Align - alignment for one element
/// \p Mask - vector of booleans which indicates what vector lanes should
/// be accessed in memory
/// \p PassThru - pass-through value that is used to fill the masked-off lanes
/// of the result
/// \p Name - name of the result variable
CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, unsigned Align,
Value *Mask, Value *PassThru,
const Twine& Name) {
auto PtrsTy = cast<VectorType>(Ptrs->getType());
auto PtrTy = cast<PointerType>(PtrsTy->getElementType());
unsigned NumElts = PtrsTy->getVectorNumElements();
Type *DataTy = VectorType::get(PtrTy->getElementType(), NumElts);
if (!Mask)
Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
NumElts));
Value * Ops[] = {Ptrs, getInt32(Align), Mask, UndefValue::get(DataTy)};
// We specify only one type when we create this intrinsic. Types of other
// arguments are derived from this type.
return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, DataTy, Name);
}
/// \brief Create a call to a Masked Scatter intrinsic.
/// \p Data - data to be stored,
/// \p Ptrs - the vector of pointers, where the \p Data elements should be
/// stored
/// \p Align - alignment for one element
/// \p Mask - vector of booleans which indicates what vector lanes should
/// be accessed in memory
CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs,
unsigned Align, Value *Mask) {
auto PtrsTy = cast<VectorType>(Ptrs->getType());
auto DataTy = cast<VectorType>(Data->getType());
auto PtrTy = cast<PointerType>(PtrsTy->getElementType());
unsigned NumElts = PtrsTy->getVectorNumElements();
assert(NumElts == DataTy->getVectorNumElements() &&
PtrTy->getElementType() == DataTy->getElementType() &&
"Incompatible pointer and data types");
if (!Mask)
Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
NumElts));
Value * Ops[] = {Data, Ptrs, getInt32(Align), Mask};
// We specify only one type when we create this intrinsic. Types of other
// arguments are derived from this type.
return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, DataTy);
}
template <typename T0, typename T1, typename T2, typename T3>
static std::vector<Value *>
getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes,
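
In both new builder routines above, only the data/result type names the intrinsic overload; everything else is derived from the vector-of-pointers operand, and a missing mask is replaced by an all-ones <N x i1> constant (plus an undef pass-through for the gather). For a <4 x float*> operand the derived type is <4 x float>, so the emitted calls presumably take the forms llvm.masked.gather.v4f32 and llvm.masked.scatter.v4f32, matching the naming checked in the new test. A standalone sketch of that type derivation:

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Sketch of the derivation done by CreateMaskedGather: from a <4 x float*>
// pointer-vector type, infer the <4 x float> data/result type.
static Type *gatherDataTypeSketch(LLVMContext &Ctx) {
  Type *EltPtrTy = PointerType::getUnqual(Type::getFloatTy(Ctx)); // float*
  VectorType *PtrsTy = VectorType::get(EltPtrTy, 4);              // <4 x float*>
  Type *EltTy = cast<PointerType>(PtrsTy->getElementType())->getElementType();
  return VectorType::get(EltTy, PtrsTy->getNumElements());        // <4 x float>
}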


@@ -1282,6 +1282,17 @@ public:
bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
}
/// Returns true if the target machine supports masked scatter operation
/// for the given \p DataType.
bool isLegalMaskedScatter(Type *DataType) {
return TTI->isLegalMaskedScatter(DataType);
}
/// Returns true if the target machine supports masked gather operation
/// for the given \p DataType.
bool isLegalMaskedGather(Type *DataType) {
return TTI->isLegalMaskedGather(DataType);
}
/// Returns true if vector representation of the instruction \p I
/// requires mask.
bool isMaskRequired(const Instruction* I) {
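
The isLegalMaskedGather/isLegalMaskedScatter wrappers above simply forward the query to TargetTransformInfo. As a rough illustration only (this is not the actual X86 hook), an AVX-512-style target could answer based on the scalar element width, since the hardware gather/scatter instructions operate on 32- and 64-bit elements:

#include "llvm/IR/Type.h"
using namespace llvm;

// Illustrative target-side rule (hypothetical, not the real X86TTIImpl code):
// allow masked gather/scatter when AVX-512 is available and the element type
// is 32 or 64 bits wide.
static bool isLegalMaskedGatherSketch(Type *DataType, bool HasAVX512) {
  unsigned Bits = DataType->getScalarSizeInBits(); // 0 for pointers/aggregates
  return HasAVX512 && (Bits == 32 || Bits == 64);
}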
@@ -2379,19 +2390,28 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
if (ScalarAllocatedSize != VectorElementSize)
return scalarizeInstruction(Instr);
// If the pointer is loop invariant or if it is non-consecutive,
// scalarize the load.
// If the pointer is loop invariant, scalarize the load.
if (LI && Legal->isUniform(Ptr))
return scalarizeInstruction(Instr);
// If the pointer is non-consecutive and gather/scatter is not supported,
// scalarize the instruction.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
bool UniformLoad = LI && Legal->isUniform(Ptr);
if (!ConsecutiveStride || UniformLoad)
bool CreateGatherScatter = !ConsecutiveStride &&
((LI && Legal->isLegalMaskedGather(ScalarDataTy)) ||
(SI && Legal->isLegalMaskedScatter(ScalarDataTy)));
if (!ConsecutiveStride && !CreateGatherScatter)
return scalarizeInstruction(Instr);
Constant *Zero = Builder.getInt32(0);
VectorParts &Entry = WidenMap.get(Instr);
VectorParts VectorGep;
// Handle consecutive loads/stores.
GetElementPtrInst *Gep = getGEPInstruction(Ptr);
if (ConsecutiveStride) {
if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
setDebugLocFromInst(Builder, Gep);
Value *PtrOperand = Gep->getPointerOperand();
@@ -2408,7 +2428,6 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()),
OrigLoop) &&
"Base ptr must be invariant");
// The last index does not have to be the induction. It can be
// consecutive and be a function of the index. For example A[I+1];
unsigned NumOperands = Gep->getNumOperands();
@@ -2436,13 +2455,42 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
}
}
Ptr = Builder.Insert(Gep2);
} else {
} else { // No GEP
// Use the induction element ptr.
assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
setDebugLocFromInst(Builder, Ptr);
VectorParts &PtrVal = getVectorValue(Ptr);
Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
}
} else {
// At this point we should have a vector version of the GEP for gather or scatter.
assert(CreateGatherScatter && "The instruction should be scalarized");
if (Gep) {
SmallVector<VectorParts, 4> OpsV;
// When vectorizing the GEP across the UF parts, keep each loop-invariant
// base or index operand of the GEP scalar.
for (Value *Op : Gep->operands()) {
if (PSE.getSE()->isLoopInvariant(PSE.getSCEV(Op), OrigLoop))
OpsV.push_back(VectorParts(UF, Op));
else
OpsV.push_back(getVectorValue(Op));
}
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value*, 4> Ops;
Value *GEPBasePtr = OpsV[0][Part];
for (unsigned i = 1; i < Gep->getNumOperands(); i++)
Ops.push_back(OpsV[i][Part]);
Value *NewGep = Builder.CreateGEP(nullptr, GEPBasePtr, Ops,
"VectorGep");
assert(NewGep->getType()->isVectorTy() && "Expected vector GEP");
NewGep = Builder.CreateBitCast(NewGep,
VectorType::get(Ptr->getType(), VF));
VectorGep.push_back(NewGep);
}
} else
VectorGep = getVectorValue(Ptr);
}
VectorParts Mask = createBlockInMask(Instr->getParent());
// Handle Stores:
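
The vector GEP built in the gather/scatter path above relies on getelementptr accepting a scalar (loop-invariant) base pointer together with a vector index and producing a vector of pointers, which is exactly what the isVectorTy assertion checks. A standalone sketch of that building block (the helper name is hypothetical):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper: a scalar base pointer combined with a widened vector
// index yields a vector-of-pointers GEP, e.g.
//   getelementptr float, float* %base, <8 x i64> %wide.index  ->  <8 x float*>
static Value *buildVectorGEPSketch(IRBuilder<> &B, Value *ScalarBase,
                                   Value *WideIndex) {
  return B.CreateGEP(nullptr, ScalarBase, WideIndex, "VectorGep");
}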
@@ -2455,6 +2503,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
VectorParts StoredVal = getVectorValue(SI->getValueOperand());
for (unsigned Part = 0; Part < UF; ++Part) {
Instruction *NewSI = nullptr;
if (CreateGatherScatter) {
Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
NewSI = Builder.CreateMaskedScatter(StoredVal[Part], VectorGep[Part],
Alignment, MaskPart);
} else {
// Calculate the pointer for the specific unroll-part.
Value *PartPtr =
Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
@@ -2473,12 +2527,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Value *VecPtr = Builder.CreateBitCast(PartPtr,
DataTy->getPointerTo(AddressSpace));
Instruction *NewSI;
if (Legal->isMaskRequired(SI))
NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
Mask[Part]);
else
NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr,
Alignment);
}
propagateMetadata(NewSI, SI);
}
return;
@@ -2488,6 +2543,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
assert(LI && "Must have a load instruction");
setDebugLocFromInst(Builder, LI);
for (unsigned Part = 0; Part < UF; ++Part) {
Instruction* NewLI;
if (CreateGatherScatter) {
Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment,
MaskPart, 0, "wide.masked.gather");
Entry[Part] = NewLI;
} else {
// Calculate the pointer for the specific unroll-part.
Value *PartPtr =
Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
@@ -2500,7 +2562,6 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Mask[Part] = reverseVector(Mask[Part]);
}
Instruction* NewLI;
Value *VecPtr = Builder.CreateBitCast(PartPtr,
DataTy->getPointerTo(AddressSpace));
if (Legal->isMaskRequired(LI))
@@ -2509,9 +2570,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
"wide.masked.load");
else
NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
propagateMetadata(NewLI, LI);
Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
}
propagateMetadata(NewLI, LI);
}
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
@@ -4520,7 +4582,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
if (!LI)
return false;
if (!SafePtrs.count(LI->getPointerOperand())) {
if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
isLegalMaskedGather(LI->getType())) {
MaskedOp.insert(LI);
continue;
}
@@ -4545,7 +4608,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
// scalarize the block.
bool isLegalMaskedOp =
isLegalMaskedStore(SI->getValueOperand()->getType(),
SI->getPointerOperand());
SI->getPointerOperand()) ||
isLegalMaskedScatter(SI->getValueOperand()->getType());
if (isLegalMaskedOp) {
--NumPredStores;
MaskedOp.insert(SI);
@@ -5281,6 +5345,19 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
return Cost;
}
/// \brief Check if the load/store instruction \p I may be translated into
/// gather/scatter during vectorization.
///
/// Pointer \p Ptr specifies the address in memory for the given scalar memory
/// instruction. We need it to retrieve the data type.
/// Using gather/scatter is possible only when it is supported by the target.
static bool isGatherOrScatterLegal(Instruction *I, Value *Ptr,
LoopVectorizationLegality *Legal) {
Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
return (isa<LoadInst>(I) && Legal->isLegalMaskedGather(DataTy)) ||
(isa<StoreInst>(I) && Legal->isLegalMaskedScatter(DataTy));
}
/// \brief Check whether the address computation for a non-consecutive memory
/// access looks like an unlikely candidate for being merged into the indexing
/// mode.
@@ -5500,11 +5577,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// Scalarized loads/stores.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool UseGatherOrScatter = (ConsecutiveStride == 0) &&
isGatherOrScatterLegal(I, Ptr, Legal);
bool Reverse = ConsecutiveStride < 0;
const DataLayout &DL = I->getModule()->getDataLayout();
unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
if ((!ConsecutiveStride && !UseGatherOrScatter) ||
ScalarAllocatedSize != VectorElementSize) {
bool IsComplexComputation =
isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
unsigned Cost = 0;
@@ -5528,8 +5609,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
return Cost;
}
// Wide load/stores.
unsigned Cost = TTI.getAddressComputationCost(VectorTy);
if (UseGatherOrScatter) {
assert(ConsecutiveStride == 0 &&
"Gather/Scatter are not used for consecutive stride");
return Cost +
TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
Legal->isMaskRequired(I), Alignment);
}
// Wide load/stores.
if (Legal->isMaskRequired(I))
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment,
AS);
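
So for a non-consecutive access there is now a third option besides scalarization: the address-computation cost plus the target's gather/scatter cost. A schematic restatement of the selection above (illustrative only; the real code also falls back to scalarization when the scalar and vector element sizes differ, and it queries TargetTransformInfo for each term):

// Schematic cost selection for a load/store at a given VF (illustrative).
static unsigned memAccessCostSketch(bool ConsecutiveStride,
                                    bool UseGatherOrScatter,
                                    unsigned AddrCost, unsigned GatherCost,
                                    unsigned ScalarizeCost,
                                    unsigned WideMemOpCost) {
  if (!ConsecutiveStride && !UseGatherOrScatter)
    return ScalarizeCost;             // per-lane scalar loads/stores
  if (UseGatherOrScatter)
    return AddrCost + GatherCost;     // new masked gather/scatter path
  return AddrCost + WideMemOpCost;    // consecutive: wide (masked) load/store
}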


@@ -0,0 +1,236 @@
; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
;AVX1-NOT: llvm.masked
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc_linux"
; The source code:
;
;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
;
; for (int i=0; i < SIZE; ++i) {
; if (trigger[i] > 0) {
; out[i] = in[index[i]] + (float) 0.5;
; }
; }
;}
;AVX512-LABEL: @foo1
;AVX512: llvm.masked.load.v8i32
;AVX512: llvm.masked.gather.v8f32
;AVX512: llvm.masked.store.v8f32
;AVX512: ret void
; Function Attrs: nounwind uwtable
define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
entry:
%in.addr = alloca float*, align 8
%out.addr = alloca float*, align 8
%trigger.addr = alloca i32*, align 8
%index.addr = alloca i32*, align 8
%i = alloca i32, align 4
store float* %in, float** %in.addr, align 8
store float* %out, float** %out.addr, align 8
store i32* %trigger, i32** %trigger.addr, align 8
store i32* %index, i32** %index.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 4096
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp sgt i32 %3, 0
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load i32*, i32** %index.addr, align 8
%arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
%6 = load i32, i32* %arrayidx3, align 4
%idxprom4 = sext i32 %6 to i64
%7 = load float*, float** %in.addr, align 8
%arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4
%8 = load float, float* %arrayidx5, align 4
%add = fadd float %8, 5.000000e-01
%9 = load i32, i32* %i, align 4
%idxprom6 = sext i32 %9 to i64
%10 = load float*, float** %out.addr, align 8
%arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6
store float %add, float* %arrayidx7, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
for.inc: ; preds = %if.end
%11 = load i32, i32* %i, align 4
%inc = add nsw i32 %11, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
; The source code
;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
;
; for (int i=0; i<SIZE; ++i) {
; if (trigger[i] > 0) {
; out[i] = in[i].b + (float) 0.5;
; }
; }
;}
%struct.In = type { float, float }
;AVX512-LABEL: @foo2
;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: llvm.masked.store.v16f32
;AVX512: ret void
define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
entry:
%in.addr = alloca %struct.In*, align 8
%out.addr = alloca float*, align 8
%trigger.addr = alloca i32*, align 8
%index.addr = alloca i32*, align 8
%i = alloca i32, align 4
store %struct.In* %in, %struct.In** %in.addr, align 8
store float* %out, float** %out.addr, align 8
store i32* %trigger, i32** %trigger.addr, align 8
store i32* %index, i32** %index.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 4096
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp sgt i32 %3, 0
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load %struct.In*, %struct.In** %in.addr, align 8
%arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
%b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
%6 = load float, float* %b, align 4
%add = fadd float %6, 5.000000e-01
%7 = load i32, i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load float*, float** %out.addr, align 8
%arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4
store float %add, float* %arrayidx5, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
for.inc: ; preds = %if.end
%9 = load i32, i32* %i, align 4
%inc = add nsw i32 %9, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
; The source code
;struct Out {
; float a;
; float b;
;};
;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
;
; for (int i=0; i<SIZE; ++i) {
; if (trigger[i] > 0) {
; out[i].b = in[i].b + (float) 0.5;
; }
; }
;}
;AVX512-LABEL: @foo3
;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: fadd <16 x float>
;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %induction, i32 1
;AVX512: llvm.masked.scatter.v16f32
;AVX512: ret void
%struct.Out = type { float, float }
define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
entry:
%in.addr = alloca %struct.In*, align 8
%out.addr = alloca %struct.Out*, align 8
%trigger.addr = alloca i32*, align 8
%i = alloca i32, align 4
store %struct.In* %in, %struct.In** %in.addr, align 8
store %struct.Out* %out, %struct.Out** %out.addr, align 8
store i32* %trigger, i32** %trigger.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 4096
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp sgt i32 %3, 0
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load %struct.In*, %struct.In** %in.addr, align 8
%arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
%b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
%6 = load float, float* %b, align 4
%add = fadd float %6, 5.000000e-01
%7 = load i32, i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load %struct.Out*, %struct.Out** %out.addr, align 8
%arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4
%b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1
store float %add, float* %b6, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
for.inc: ; preds = %if.end
%9 = load i32, i32* %i, align 4
%inc = add nsw i32 %9, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>)


@@ -278,7 +278,8 @@ for.end: ; preds = %for.cond
;AVX: ret void
;AVX512-LABEL: @foo4
;AVX512-NOT: llvm.masked
;AVX512-NOT: llvm.masked.load
;AVX512: llvm.masked.gather
;AVX512: ret void
; Function Attrs: nounwind uwtable