
Create masked gather and scatter intrinsics in Loop Vectorizer.

The loop vectorizer now knows how to vectorize GEPs and to create masked gather and scatter intrinsics for random memory accesses.

The feature is enabled for AVX-512 targets.
Differential Revision: http://reviews.llvm.org/D15690

llvm-svn: 261140
Elena Demikhovsky 2016-02-17 19:23:04 +00:00
parent 14d2c58ecf
commit fdd98d5776
5 changed files with 497 additions and 109 deletions
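
To make the targeted pattern concrete, here are C++ versions of two of the loops exercised by the new test added below (foo1 and foo3; SIZE corresponds to the 4096 trip count in the test IR). The indexed load in[index[i]] is a random access that can now be widened into a masked gather, and the strided field store out[i].b into a masked scatter, both under the predicate trigger[i] > 0.

// Illustrative source for the new test's foo1/foo3 patterns (names as in the
// test; SIZE is the 4096 trip count used there).
constexpr int SIZE = 4096;

struct In  { float a, b; };
struct Out { float a, b; };

// in[index[i]] is non-consecutive: with AVX-512 it becomes llvm.masked.gather
// instead of being scalarized.
void foo1(const float *__restrict__ in, float *__restrict__ out,
          const int *__restrict__ trigger, const int *__restrict__ index) {
  for (int i = 0; i < SIZE; ++i)
    if (trigger[i] > 0)
      out[i] = in[index[i]] + 0.5f;
}

// out[i].b is a strided (non-consecutive) store: it becomes
// llvm.masked.scatter.
void foo3(const In *__restrict__ in, Out *__restrict__ out,
          const int *__restrict__ trigger) {
  for (int i = 0; i < SIZE; ++i)
    if (trigger[i] > 0)
      out[i].b = in[i].b + 0.5f;
}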


@@ -436,6 +436,14 @@ public:
CallInst *CreateMaskedStore(Value *Val, Value *Ptr, unsigned Align,
Value *Mask);
/// \brief Create a call to Masked Gather intrinsic
CallInst *CreateMaskedGather(Value *Ptrs, unsigned Align, Value *Mask = 0,
Value *PassThru = 0, const Twine& Name = "");
/// \brief Create a call to Masked Scatter intrinsic
CallInst *CreateMaskedScatter(Value *Val, Value *Ptrs, unsigned Align,
Value *Mask = 0);
/// \brief Create an assume intrinsic call that allows the optimizer to
/// assume that the provided condition will be true.
CallInst *CreateAssumption(Value *Cond);
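
A minimal usage sketch of the two new entry points declared above (the helper name is hypothetical; Ptrs is assumed to be an already-formed vector of pointers, Data a matching data vector, and Mask may be null):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper, not part of the patch: emit a masked gather followed
// by a masked scatter through the new IRBuilder entry points.
static Value *emitGatherThenScatter(IRBuilder<> &B, Value *Ptrs, Value *Data,
                                    Value *Mask) {
  // Ptrs is a vector of pointers (e.g. <8 x float*>). Passing a null Mask or
  // PassThru falls back to the defaults: an all-ones mask and an undef
  // pass-through value.
  Value *Loaded = B.CreateMaskedGather(Ptrs, /*Align=*/4, Mask,
                                       /*PassThru=*/nullptr, "gathered");
  B.CreateMaskedScatter(Data, Ptrs, /*Align=*/4, Mask);
  return Loaded;
}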


@@ -201,18 +201,17 @@ CallInst *IRBuilderBase::CreateAssumption(Value *Cond) {
return createCallHelper(FnAssume, Ops, this);
}
/// Create a call to a Masked Load intrinsic.
/// Ptr - the base pointer for the load
/// Align - alignment of the source location
/// Mask - an vector of booleans which indicates what vector lanes should
/// \brief Create a call to a Masked Load intrinsic.
/// \p Ptr - base pointer for the load
/// \p Align - alignment of the source location
/// \p Mask - vector of booleans which indicates what vector lanes should
/// be accessed in memory
/// PassThru - a pass-through value that is used to fill the masked-off lanes
/// \p PassThru - pass-through value that is used to fill the masked-off lanes
/// of the result
/// Name - name of the result variable
/// \p Name - name of the result variable
CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align,
Value *Mask, Value *PassThru,
const Twine &Name) {
assert(Ptr->getType()->isPointerTy() && "Ptr must be of pointer type");
// DataTy is the overloaded type
Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
assert(DataTy->isVectorTy() && "Ptr should point to a vector");
@@ -222,11 +221,11 @@ CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align,
return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops, DataTy, Name);
}
/// Create a call to a Masked Store intrinsic.
/// Val - the data to be stored,
/// Ptr - the base pointer for the store
/// Align - alignment of the destination location
/// Mask - an vector of booleans which indicates what vector lanes should
/// \brief Create a call to a Masked Store intrinsic.
/// \p Val - data to be stored,
/// \p Ptr - base pointer for the store
/// \p Align - alignment of the destination location
/// \p Mask - vector of booleans which indicates what vector lanes should
/// be accessed in memory
CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr,
unsigned Align, Value *Mask) {
@@ -247,6 +246,62 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id,
return createCallHelper(TheFn, Ops, this, Name);
}
/// \brief Create a call to a Masked Gather intrinsic.
/// \p Ptrs - vector of pointers for loading
/// \p Align - alignment for one element
/// \p Mask - vector of booleans which indicates what vector lanes should
/// be accessed in memory
/// \p PassThru - pass-through value that is used to fill the masked-off lanes
/// of the result
/// \p Name - name of the result variable
CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, unsigned Align,
Value *Mask, Value *PassThru,
const Twine& Name) {
auto PtrsTy = cast<VectorType>(Ptrs->getType());
auto PtrTy = cast<PointerType>(PtrsTy->getElementType());
unsigned NumElts = PtrsTy->getVectorNumElements();
Type *DataTy = VectorType::get(PtrTy->getElementType(), NumElts);
if (!Mask)
Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
NumElts));
Value * Ops[] = {Ptrs, getInt32(Align), Mask, UndefValue::get(DataTy)};
// We specify only one type when we create this intrinsic. Types of other
// arguments are derived from this type.
return CreateMaskedIntrinsic(Intrinsic::masked_gather, Ops, DataTy, Name);
}
/// \brief Create a call to a Masked Scatter intrinsic.
/// \p Data - data to be stored,
/// \p Ptrs - the vector of pointers, where the \p Data elements should be
/// stored
/// \p Align - alignment for one element
/// \p Mask - vector of booleans which indicates what vector lanes should
/// be accessed in memory
CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs,
unsigned Align, Value *Mask) {
auto PtrsTy = cast<VectorType>(Ptrs->getType());
auto DataTy = cast<VectorType>(Data->getType());
auto PtrTy = cast<PointerType>(PtrsTy->getElementType());
unsigned NumElts = PtrsTy->getVectorNumElements();
assert(NumElts == DataTy->getVectorNumElements() &&
PtrTy->getElementType() == DataTy->getElementType() &&
"Incompatible pointer and data types");
if (!Mask)
Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
NumElts));
Value * Ops[] = {Data, Ptrs, getInt32(Align), Mask};
// We specify only one type when we create this intrinsic. Types of other
// arguments are derived from this type.
return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, DataTy);
}
template <typename T0, typename T1, typename T2, typename T3>
static std::vector<Value *>
getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes,
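
In both new builder routines above, only the data/result type names the intrinsic overload; everything else is derived from the vector-of-pointers operand, and a missing mask is replaced by an all-ones <N x i1> constant (plus an undef pass-through for the gather). For a <4 x float*> operand the derived type is <4 x float>, so the emitted calls presumably take the forms llvm.masked.gather.v4f32 and llvm.masked.scatter.v4f32, matching the naming checked in the new test. A standalone sketch of that type derivation:

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Sketch of the derivation done by CreateMaskedGather: from a <4 x float*>
// pointer-vector type, infer the <4 x float> data/result type.
static Type *gatherDataTypeSketch(LLVMContext &Ctx) {
  Type *EltPtrTy = PointerType::getUnqual(Type::getFloatTy(Ctx)); // float*
  VectorType *PtrsTy = VectorType::get(EltPtrTy, 4);              // <4 x float*>
  Type *EltTy = cast<PointerType>(PtrsTy->getElementType())->getElementType();
  return VectorType::get(EltTy, PtrsTy->getNumElements());        // <4 x float>
}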


@@ -1282,6 +1282,17 @@ public:
bool isLegalMaskedLoad(Type *DataType, Value *Ptr) {
return isConsecutivePtr(Ptr) && TTI->isLegalMaskedLoad(DataType);
}
/// Returns true if the target machine supports masked scatter operation
/// for the given \p DataType.
bool isLegalMaskedScatter(Type *DataType) {
return TTI->isLegalMaskedScatter(DataType);
}
/// Returns true if the target machine supports masked gather operation
/// for the given \p DataType.
bool isLegalMaskedGather(Type *DataType) {
return TTI->isLegalMaskedGather(DataType);
}
/// Returns true if vector representation of the instruction \p I
/// requires mask.
bool isMaskRequired(const Instruction* I) {
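
The isLegalMaskedGather/isLegalMaskedScatter wrappers above simply forward the query to TargetTransformInfo. As a rough illustration only (this is not the actual X86 hook), an AVX-512-style target could answer based on the scalar element width, since the hardware gather/scatter instructions operate on 32- and 64-bit elements:

#include "llvm/IR/Type.h"
using namespace llvm;

// Illustrative target-side rule (hypothetical, not the real X86TTIImpl code):
// allow masked gather/scatter when AVX-512 is available and the element type
// is 32 or 64 bits wide.
static bool isLegalMaskedGatherSketch(Type *DataType, bool HasAVX512) {
  unsigned Bits = DataType->getScalarSizeInBits(); // 0 for pointers/aggregates
  return HasAVX512 && (Bits == 32 || Bits == 64);
}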
@@ -2379,19 +2390,28 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
if (ScalarAllocatedSize != VectorElementSize)
return scalarizeInstruction(Instr);
// If the pointer is loop invariant or if it is non-consecutive,
// scalarize the load.
// If the pointer is loop invariant, scalarize the load.
if (LI && Legal->isUniform(Ptr))
return scalarizeInstruction(Instr);
// If the pointer is non-consecutive and gather/scatter is not supported,
// scalarize the instruction.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool Reverse = ConsecutiveStride < 0;
bool UniformLoad = LI && Legal->isUniform(Ptr);
if (!ConsecutiveStride || UniformLoad)
bool CreateGatherScatter = !ConsecutiveStride &&
((LI && Legal->isLegalMaskedGather(ScalarDataTy)) ||
(SI && Legal->isLegalMaskedScatter(ScalarDataTy)));
if (!ConsecutiveStride && !CreateGatherScatter)
return scalarizeInstruction(Instr);
Constant *Zero = Builder.getInt32(0);
VectorParts &Entry = WidenMap.get(Instr);
VectorParts VectorGep;
// Handle consecutive loads/stores.
GetElementPtrInst *Gep = getGEPInstruction(Ptr);
if (ConsecutiveStride) {
if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
setDebugLocFromInst(Builder, Gep);
Value *PtrOperand = Gep->getPointerOperand();
@@ -2408,7 +2428,6 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
assert(PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getPointerOperand()),
OrigLoop) &&
"Base ptr must be invariant");
// The last index does not have to be the induction. It can be
// consecutive and be a function of the index. For example A[I+1];
unsigned NumOperands = Gep->getNumOperands();
@@ -2436,13 +2455,42 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
}
}
Ptr = Builder.Insert(Gep2);
} else {
} else { // No GEP
// Use the induction element ptr.
assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
setDebugLocFromInst(Builder, Ptr);
VectorParts &PtrVal = getVectorValue(Ptr);
Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
}
} else {
// At this point we should have a vector version of the GEP for gather or scatter.
assert(CreateGatherScatter && "The instruction should be scalarized");
if (Gep) {
SmallVector<VectorParts, 4> OpsV;
// When vectorizing the GEP across the UF parts, keep each loop-invariant
// base or index operand of the GEP scalar.
for (Value *Op : Gep->operands()) {
if (PSE.getSE()->isLoopInvariant(PSE.getSCEV(Op), OrigLoop))
OpsV.push_back(VectorParts(UF, Op));
else
OpsV.push_back(getVectorValue(Op));
}
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value*, 4> Ops;
Value *GEPBasePtr = OpsV[0][Part];
for (unsigned i = 1; i < Gep->getNumOperands(); i++)
Ops.push_back(OpsV[i][Part]);
Value *NewGep = Builder.CreateGEP(nullptr, GEPBasePtr, Ops,
"VectorGep");
assert(NewGep->getType()->isVectorTy() && "Expected vector GEP");
NewGep = Builder.CreateBitCast(NewGep,
VectorType::get(Ptr->getType(), VF));
VectorGep.push_back(NewGep);
}
} else
VectorGep = getVectorValue(Ptr);
}
VectorParts Mask = createBlockInMask(Instr->getParent());
// Handle Stores:
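
The vector GEP built in the gather/scatter path above relies on getelementptr accepting a scalar (loop-invariant) base pointer together with a vector index and producing a vector of pointers, which is exactly what the isVectorTy assertion checks. A standalone sketch of that building block (the helper name is hypothetical):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Hypothetical helper: a scalar base pointer combined with a widened vector
// index yields a vector-of-pointers GEP, e.g.
//   getelementptr float, float* %base, <8 x i64> %wide.index  ->  <8 x float*>
static Value *buildVectorGEPSketch(IRBuilder<> &B, Value *ScalarBase,
                                   Value *WideIndex) {
  return B.CreateGEP(nullptr, ScalarBase, WideIndex, "VectorGep");
}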
@@ -2455,6 +2503,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
VectorParts StoredVal = getVectorValue(SI->getValueOperand());
for (unsigned Part = 0; Part < UF; ++Part) {
Instruction *NewSI = nullptr;
if (CreateGatherScatter) {
Value *MaskPart = Legal->isMaskRequired(SI) ? Mask[Part] : nullptr;
NewSI = Builder.CreateMaskedScatter(StoredVal[Part], VectorGep[Part],
Alignment, MaskPart);
} else {
// Calculate the pointer for the specific unroll-part.
Value *PartPtr =
Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
@@ -2473,12 +2527,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Value *VecPtr = Builder.CreateBitCast(PartPtr,
DataTy->getPointerTo(AddressSpace));
Instruction *NewSI;
if (Legal->isMaskRequired(SI))
NewSI = Builder.CreateMaskedStore(StoredVal[Part], VecPtr, Alignment,
Mask[Part]);
else
NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr, Alignment);
NewSI = Builder.CreateAlignedStore(StoredVal[Part], VecPtr,
Alignment);
}
propagateMetadata(NewSI, SI);
}
return;
@@ -2488,6 +2543,13 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
assert(LI && "Must have a load instruction");
setDebugLocFromInst(Builder, LI);
for (unsigned Part = 0; Part < UF; ++Part) {
Instruction* NewLI;
if (CreateGatherScatter) {
Value *MaskPart = Legal->isMaskRequired(LI) ? Mask[Part] : nullptr;
NewLI = Builder.CreateMaskedGather(VectorGep[Part], Alignment,
MaskPart, 0, "wide.masked.gather");
Entry[Part] = NewLI;
} else {
// Calculate the pointer for the specific unroll-part.
Value *PartPtr =
Builder.CreateGEP(nullptr, Ptr, Builder.getInt32(Part * VF));
@@ -2500,7 +2562,6 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
Mask[Part] = reverseVector(Mask[Part]);
}
Instruction* NewLI;
Value *VecPtr = Builder.CreateBitCast(PartPtr,
DataTy->getPointerTo(AddressSpace));
if (Legal->isMaskRequired(LI))
@@ -2509,9 +2570,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
"wide.masked.load");
else
NewLI = Builder.CreateAlignedLoad(VecPtr, Alignment, "wide.load");
propagateMetadata(NewLI, LI);
Entry[Part] = Reverse ? reverseVector(NewLI) : NewLI;
}
propagateMetadata(NewLI, LI);
}
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
@@ -4520,7 +4582,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
if (!LI)
return false;
if (!SafePtrs.count(LI->getPointerOperand())) {
if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand())) {
if (isLegalMaskedLoad(LI->getType(), LI->getPointerOperand()) ||
isLegalMaskedGather(LI->getType())) {
MaskedOp.insert(LI);
continue;
}
@@ -4545,7 +4608,8 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB,
// scalarize the block.
bool isLegalMaskedOp =
isLegalMaskedStore(SI->getValueOperand()->getType(),
SI->getPointerOperand());
SI->getPointerOperand()) ||
isLegalMaskedScatter(SI->getValueOperand()->getType());
if (isLegalMaskedOp) {
--NumPredStores;
MaskedOp.insert(SI);
@@ -5281,6 +5345,19 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
return Cost;
}
/// \brief Check if the load/store instruction \p I may be translated into
/// gather/scatter during vectorization.
///
/// Pointer \p Ptr specifies the address in memory for the given scalar memory
/// instruction. We need it to retrieve the data type.
/// Using gather/scatter is possible only when it is supported by the target.
static bool isGatherOrScatterLegal(Instruction *I, Value *Ptr,
LoopVectorizationLegality *Legal) {
Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
return (isa<LoadInst>(I) && Legal->isLegalMaskedGather(DataTy)) ||
(isa<StoreInst>(I) && Legal->isLegalMaskedScatter(DataTy));
}
/// \brief Check whether the address computation for a non-consecutive memory
/// access looks like an unlikely candidate for being merged into the indexing
/// mode.
@@ -5500,11 +5577,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
// Scalarized loads/stores.
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
bool UseGatherOrScatter = (ConsecutiveStride == 0) &&
isGatherOrScatterLegal(I, Ptr, Legal);
bool Reverse = ConsecutiveStride < 0;
const DataLayout &DL = I->getModule()->getDataLayout();
unsigned ScalarAllocatedSize = DL.getTypeAllocSize(ValTy);
unsigned VectorElementSize = DL.getTypeStoreSize(VectorTy) / VF;
if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
if ((!ConsecutiveStride && !UseGatherOrScatter) ||
ScalarAllocatedSize != VectorElementSize) {
bool IsComplexComputation =
isLikelyComplexAddressComputation(Ptr, Legal, SE, TheLoop);
unsigned Cost = 0;
@@ -5528,8 +5609,15 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
return Cost;
}
// Wide load/stores.
unsigned Cost = TTI.getAddressComputationCost(VectorTy);
if (UseGatherOrScatter) {
assert(ConsecutiveStride == 0 &&
"Gather/Scatter are not used for consecutive stride");
return Cost +
TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
Legal->isMaskRequired(I), Alignment);
}
// Wide load/stores.
if (Legal->isMaskRequired(I))
Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment,
AS);
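
So for a non-consecutive access there is now a third option besides scalarization: the address-computation cost plus the target's gather/scatter cost. A schematic restatement of the selection above (illustrative only; the real code also falls back to scalarization when the scalar and vector element sizes differ, and it queries TargetTransformInfo for each term):

// Schematic cost selection for a load/store at a given VF (illustrative).
static unsigned memAccessCostSketch(bool ConsecutiveStride,
                                    bool UseGatherOrScatter,
                                    unsigned AddrCost, unsigned GatherCost,
                                    unsigned ScalarizeCost,
                                    unsigned WideMemOpCost) {
  if (!ConsecutiveStride && !UseGatherOrScatter)
    return ScalarizeCost;             // per-lane scalar loads/stores
  if (UseGatherOrScatter)
    return AddrCost + GatherCost;     // new masked gather/scatter path
  return AddrCost + WideMemOpCost;    // consecutive: wide (masked) load/store
}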


@@ -0,0 +1,236 @@
; RUN: opt < %s -O3 -mcpu=knl -S | FileCheck %s -check-prefix=AVX512
;AVX1-NOT: llvm.masked
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-pc_linux"
; The source code:
;
;void foo1(float * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger, int * __restrict__ index) {
;
; for (int i=0; i < SIZE; ++i) {
; if (trigger[i] > 0) {
; out[i] = in[index[i]] + (float) 0.5;
; }
; }
;}
;AVX512-LABEL: @foo1
;AVX512: llvm.masked.load.v8i32
;AVX512: llvm.masked.gather.v8f32
;AVX512: llvm.masked.store.v8f32
;AVX512: ret void
; Function Attrs: nounwind uwtable
define void @foo1(float* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) {
entry:
%in.addr = alloca float*, align 8
%out.addr = alloca float*, align 8
%trigger.addr = alloca i32*, align 8
%index.addr = alloca i32*, align 8
%i = alloca i32, align 4
store float* %in, float** %in.addr, align 8
store float* %out, float** %out.addr, align 8
store i32* %trigger, i32** %trigger.addr, align 8
store i32* %index, i32** %index.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 4096
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp sgt i32 %3, 0
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load i32*, i32** %index.addr, align 8
%arrayidx3 = getelementptr inbounds i32, i32* %5, i64 %idxprom2
%6 = load i32, i32* %arrayidx3, align 4
%idxprom4 = sext i32 %6 to i64
%7 = load float*, float** %in.addr, align 8
%arrayidx5 = getelementptr inbounds float, float* %7, i64 %idxprom4
%8 = load float, float* %arrayidx5, align 4
%add = fadd float %8, 5.000000e-01
%9 = load i32, i32* %i, align 4
%idxprom6 = sext i32 %9 to i64
%10 = load float*, float** %out.addr, align 8
%arrayidx7 = getelementptr inbounds float, float* %10, i64 %idxprom6
store float %add, float* %arrayidx7, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
for.inc: ; preds = %if.end
%11 = load i32, i32* %i, align 4
%inc = add nsw i32 %11, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
; The source code
;void foo2 (In * __restrict__ in, float * __restrict__ out, int * __restrict__ trigger) {
;
; for (int i=0; i<SIZE; ++i) {
; if (trigger[i] > 0) {
; out[i] = in[i].b + (float) 0.5;
; }
; }
;}
%struct.In = type { float, float }
;AVX512-LABEL: @foo2
;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: llvm.masked.store.v16f32
;AVX512: ret void
define void @foo2(%struct.In* noalias %in, float* noalias %out, i32* noalias %trigger, i32* noalias %index) #0 {
entry:
%in.addr = alloca %struct.In*, align 8
%out.addr = alloca float*, align 8
%trigger.addr = alloca i32*, align 8
%index.addr = alloca i32*, align 8
%i = alloca i32, align 4
store %struct.In* %in, %struct.In** %in.addr, align 8
store float* %out, float** %out.addr, align 8
store i32* %trigger, i32** %trigger.addr, align 8
store i32* %index, i32** %index.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 4096
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp sgt i32 %3, 0
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load %struct.In*, %struct.In** %in.addr, align 8
%arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
%b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
%6 = load float, float* %b, align 4
%add = fadd float %6, 5.000000e-01
%7 = load i32, i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load float*, float** %out.addr, align 8
%arrayidx5 = getelementptr inbounds float, float* %8, i64 %idxprom4
store float %add, float* %arrayidx5, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
for.inc: ; preds = %if.end
%9 = load i32, i32* %i, align 4
%inc = add nsw i32 %9, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
; The source code
;struct Out {
; float a;
; float b;
;};
;void foo3 (In * __restrict__ in, Out * __restrict__ out, int * __restrict__ trigger) {
;
; for (int i=0; i<SIZE; ++i) {
; if (trigger[i] > 0) {
; out[i].b = in[i].b + (float) 0.5;
; }
; }
;}
;AVX512-LABEL: @foo3
;AVX512: getelementptr %struct.In, %struct.In* %in, <16 x i64> %induction, i32 1
;AVX512: llvm.masked.gather.v16f32
;AVX512: fadd <16 x float>
;AVX512: getelementptr %struct.Out, %struct.Out* %out, <16 x i64> %induction, i32 1
;AVX512: llvm.masked.scatter.v16f32
;AVX512: ret void
%struct.Out = type { float, float }
define void @foo3(%struct.In* noalias %in, %struct.Out* noalias %out, i32* noalias %trigger) {
entry:
%in.addr = alloca %struct.In*, align 8
%out.addr = alloca %struct.Out*, align 8
%trigger.addr = alloca i32*, align 8
%i = alloca i32, align 4
store %struct.In* %in, %struct.In** %in.addr, align 8
store %struct.Out* %out, %struct.Out** %out.addr, align 8
store i32* %trigger, i32** %trigger.addr, align 8
store i32 0, i32* %i, align 4
br label %for.cond
for.cond: ; preds = %for.inc, %entry
%0 = load i32, i32* %i, align 4
%cmp = icmp slt i32 %0, 4096
br i1 %cmp, label %for.body, label %for.end
for.body: ; preds = %for.cond
%1 = load i32, i32* %i, align 4
%idxprom = sext i32 %1 to i64
%2 = load i32*, i32** %trigger.addr, align 8
%arrayidx = getelementptr inbounds i32, i32* %2, i64 %idxprom
%3 = load i32, i32* %arrayidx, align 4
%cmp1 = icmp sgt i32 %3, 0
br i1 %cmp1, label %if.then, label %if.end
if.then: ; preds = %for.body
%4 = load i32, i32* %i, align 4
%idxprom2 = sext i32 %4 to i64
%5 = load %struct.In*, %struct.In** %in.addr, align 8
%arrayidx3 = getelementptr inbounds %struct.In, %struct.In* %5, i64 %idxprom2
%b = getelementptr inbounds %struct.In, %struct.In* %arrayidx3, i32 0, i32 1
%6 = load float, float* %b, align 4
%add = fadd float %6, 5.000000e-01
%7 = load i32, i32* %i, align 4
%idxprom4 = sext i32 %7 to i64
%8 = load %struct.Out*, %struct.Out** %out.addr, align 8
%arrayidx5 = getelementptr inbounds %struct.Out, %struct.Out* %8, i64 %idxprom4
%b6 = getelementptr inbounds %struct.Out, %struct.Out* %arrayidx5, i32 0, i32 1
store float %add, float* %b6, align 4
br label %if.end
if.end: ; preds = %if.then, %for.body
br label %for.inc
for.inc: ; preds = %if.end
%9 = load i32, i32* %i, align 4
%inc = add nsw i32 %9, 1
store i32 %inc, i32* %i, align 4
br label %for.cond
for.end: ; preds = %for.cond
ret void
}
declare void @llvm.masked.scatter.v16f32(<16 x float>, <16 x float*>, i32, <16 x i1>)


@@ -278,7 +278,8 @@ for.end: ; preds = %for.cond
;AVX: ret void
;AVX512-LABEL: @foo4
;AVX512-NOT: llvm.masked
;AVX512-NOT: llvm.masked.load
;AVX512: llvm.masked.gather
;AVX512: ret void
; Function Attrs: nounwind uwtable