mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2024-11-26 04:32:44 +01:00
[SLP] Vectorize the index computations of getelementptr instructions.
This patch seeds the SLP vectorizer with getelementptr indices. The primary motivation in doing so is to vectorize gather-like idioms beginning with consecutive loads (e.g., g[a[0] - b[0]] + g[a[1] - b[1]] + ...). While these cases could be vectorized with a top-down phase, seeding the existing bottom-up phase with the index computations avoids the complexity, compile-time, and phase ordering issues associated with a full top-down pass. Only bundles of single-index getelementptrs with non-constant differences are considered for vectorization. Differential Revision: http://reviews.llvm.org/D14829 llvm-svn: 257800
This commit is contained in:
parent
e79b0c88ff
commit
b2378417a2
@ -412,6 +412,13 @@ public:
|
||||
return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
|
||||
}
|
||||
|
||||
/// \return The vector element size in bits to use when vectorizing the
|
||||
/// expression tree ending at \p V. If V is a store, the size is the width of
|
||||
/// the stored value. Otherwise, the size is the width of the largest loaded
|
||||
/// value reaching V. This method is used by the vectorizer to calculate
|
||||
/// vectorization factors.
|
||||
unsigned getVectorElementSize(Value *V);
|
||||
|
||||
private:
|
||||
struct TreeEntry;
|
||||
|
||||
@ -3139,10 +3146,73 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
|
||||
BS->ScheduleStart = nullptr;
|
||||
}
|
||||
|
||||
// Returns the vector element size, in bits, to use when vectorizing the
// expression tree ending at V. For a store this is the width of the stored
// value. Otherwise it is the width of the widest load found while walking V's
// operands, falling back to the width of V's own type when no load is found or
// an unhandled instruction is encountered.
unsigned BoUpSLP::getVectorElementSize(Value *V) {
  const DataLayout &DL = F->getParent()->getDataLayout();

  // If V is a store, just return the width of the stored value without
  // traversing the expression tree. This is the common case.
  if (auto *Store = dyn_cast<StoreInst>(V))
    return DL.getTypeSizeInBits(Store->getValueOperand()->getType());

  // If V is not a store, we can traverse the expression tree to find loads
  // that feed it. The type of the loaded value may indicate a more suitable
  // width than V's type. We want to base the vector element size on the width
  // of memory operations where possible.
  SmallVector<Instruction *, 16> Worklist;
  SmallPtrSet<Instruction *, 16> Visited;
  if (auto *I = dyn_cast<Instruction>(V))
    Worklist.push_back(I);

  // Traverse the expression tree in bottom-up order looking for loads. If we
  // encounter an instruction we don't yet handle, we give up.
  unsigned MaxWidth = 0;
  bool FoundUnknownInst = false;
  while (!Worklist.empty() && !FoundUnknownInst) {
    Instruction *I = Worklist.pop_back_val();

    // An instruction that is an operand of several users can be queued more
    // than once before it is first popped. Process it only the first time;
    // re-processing cannot change MaxWidth or FoundUnknownInst.
    if (!Visited.insert(I).second)
      continue;

    Type *Ty = I->getType();
    if (isa<VectorType>(Ty)) {
      // We should only be looking at scalar instructions here. If the current
      // instruction has a vector type, give up.
      FoundUnknownInst = true;
    } else if (isa<LoadInst>(I)) {
      // If the current instruction is a load, update MaxWidth to reflect the
      // width of the loaded value.
      MaxWidth =
          std::max(MaxWidth, static_cast<unsigned>(DL.getTypeSizeInBits(Ty)));
    } else if (isa<PHINode>(I) || isa<CastInst>(I) ||
               isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
               isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
      // Otherwise, we need to visit the operands of the instruction. We only
      // handle the interesting cases from buildTree here. If an operand is an
      // instruction we haven't yet visited, we add it to the worklist.
      for (Use &U : I->operands())
        if (auto *J = dyn_cast<Instruction>(U.get()))
          if (!Visited.count(J))
            Worklist.push_back(J);
    } else {
      // If we don't yet handle the instruction, give up.
      FoundUnknownInst = true;
    }
  }

  // If we didn't encounter a memory access in the expression tree, or if we
  // gave up for some reason, just return the width of V.
  if (!MaxWidth || FoundUnknownInst)
    return DL.getTypeSizeInBits(V->getType());

  // Otherwise, return the maximum width we found.
  return MaxWidth;
}
|
||||
|
||||
/// The SLPVectorizer Pass.
|
||||
struct SLPVectorizer : public FunctionPass {
|
||||
typedef SmallVector<StoreInst *, 8> StoreList;
|
||||
typedef MapVector<Value *, StoreList> StoreListMap;
|
||||
typedef SmallVector<WeakVH, 8> WeakVHList;
|
||||
typedef MapVector<Value *, WeakVHList> WeakVHListMap;
|
||||
|
||||
/// Pass identification, replacement for typeid
|
||||
static char ID;
|
||||
@ -3172,7 +3242,8 @@ struct SLPVectorizer : public FunctionPass {
|
||||
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
|
||||
|
||||
StoreRefs.clear();
|
||||
Stores.clear();
|
||||
GEPs.clear();
|
||||
bool Changed = false;
|
||||
|
||||
// If the target claims to have no vector registers don't attempt
|
||||
@ -3206,15 +3277,24 @@ struct SLPVectorizer : public FunctionPass {
|
||||
|
||||
// Scan the blocks in the function in post order.
|
||||
for (auto BB : post_order(&F.getEntryBlock())) {
|
||||
collectSeedInstructions(BB);
|
||||
|
||||
// Vectorize trees that end at stores.
|
||||
if (unsigned count = collectStores(BB, R)) {
|
||||
(void)count;
|
||||
DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
|
||||
if (NumStores > 0) {
|
||||
DEBUG(dbgs() << "SLP: Found " << NumStores << " stores.\n");
|
||||
Changed |= vectorizeStoreChains(R);
|
||||
}
|
||||
|
||||
// Vectorize trees that end at reductions.
|
||||
Changed |= vectorizeChainsInBlock(BB, R);
|
||||
|
||||
// Vectorize the index computations of getelementptr instructions. This
|
||||
// is primarily intended to catch gather-like idioms ending at
|
||||
// non-consecutive loads.
|
||||
if (NumGEPs > 0) {
|
||||
DEBUG(dbgs() << "SLP: Found " << NumGEPs << " GEPs.\n");
|
||||
Changed |= vectorizeGEPIndices(BB, R);
|
||||
}
|
||||
}
|
||||
|
||||
if (Changed) {
|
||||
@ -3241,12 +3321,14 @@ struct SLPVectorizer : public FunctionPass {
|
||||
}
|
||||
|
||||
private:
|
||||
|
||||
/// \brief Collect memory references and sort them according to their base
|
||||
/// object. We sort the stores to their base objects to reduce the cost of the
|
||||
/// quadratic search on the stores. TODO: We can further reduce this cost
|
||||
/// if we flush the chain creation every time we run into a memory barrier.
|
||||
unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
|
||||
/// \brief Collect store and getelementptr instructions and organize them
|
||||
/// according to the underlying object of their pointer operands. We sort the
|
||||
/// instructions by their underlying objects to reduce the cost of
|
||||
/// consecutive access queries.
|
||||
///
|
||||
/// TODO: We can further reduce this cost if we flush the chain creation
|
||||
/// every time we run into a memory barrier.
|
||||
void collectSeedInstructions(BasicBlock *BB);
|
||||
|
||||
/// \brief Try to vectorize a chain that starts at two arithmetic instrs.
|
||||
bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
|
||||
@ -3262,9 +3344,13 @@ private:
|
||||
/// \brief Try to vectorize a chain that may start at the operands of \p V.
|
||||
bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);
|
||||
|
||||
/// \brief Vectorize the stores that were collected in StoreRefs.
|
||||
/// \brief Vectorize the store instructions collected in Stores.
|
||||
bool vectorizeStoreChains(BoUpSLP &R);
|
||||
|
||||
/// \brief Vectorize the index computations of the getelementptr instructions
|
||||
/// collected in GEPs.
|
||||
bool vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R);
|
||||
|
||||
/// \brief Scan the basic block and look for patterns that are likely to start
|
||||
/// a vectorization chain.
|
||||
bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
|
||||
@ -3274,8 +3360,19 @@ private:
|
||||
|
||||
bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
|
||||
BoUpSLP &R);
|
||||
private:
|
||||
StoreListMap StoreRefs;
|
||||
|
||||
/// The store instructions in a basic block organized by base pointer.
|
||||
StoreListMap Stores;
|
||||
|
||||
/// The getelementptr instructions in a basic block organized by base pointer.
|
||||
WeakVHListMap GEPs;
|
||||
|
||||
/// The number of store instructions in a basic block.
|
||||
unsigned NumStores;
|
||||
|
||||
/// The number of getelementptr instructions in a basic block.
|
||||
unsigned NumGEPs;
|
||||
|
||||
unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
|
||||
};
|
||||
|
||||
@ -3296,9 +3393,7 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
|
||||
unsigned ChainLen = Chain.size();
|
||||
DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
|
||||
<< "\n");
|
||||
Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
|
||||
auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
|
||||
unsigned Sz = DL.getTypeSizeInBits(StoreTy);
|
||||
unsigned Sz = R.getVectorElementSize(Chain[0]);
|
||||
unsigned VF = VecRegSize / Sz;
|
||||
|
||||
if (!isPowerOf2_32(Sz) || VF < 2)
|
||||
@ -3409,33 +3504,43 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
|
||||
return Changed;
|
||||
}
|
||||
|
||||
void SLPVectorizer::collectSeedInstructions(BasicBlock *BB) {
|
||||
|
||||
unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
|
||||
unsigned count = 0;
|
||||
StoreRefs.clear();
|
||||
// Initialize the collections. We will make a single pass over the block.
|
||||
Stores.clear();
|
||||
GEPs.clear();
|
||||
NumStores = NumGEPs = 0;
|
||||
const DataLayout &DL = BB->getModule()->getDataLayout();
|
||||
|
||||
// Visit the store and getelementptr instructions in BB and organize them in
|
||||
// Stores and GEPs according to the underlying objects of their pointer
|
||||
// operands.
|
||||
for (Instruction &I : *BB) {
|
||||
StoreInst *SI = dyn_cast<StoreInst>(&I);
|
||||
if (!SI)
|
||||
continue;
|
||||
|
||||
// Don't touch volatile stores.
|
||||
if (!SI->isSimple())
|
||||
continue;
|
||||
// Ignore store instructions that are volatile or have a pointer operand
|
||||
// that doesn't point to a scalar type.
|
||||
if (auto *SI = dyn_cast<StoreInst>(&I)) {
|
||||
if (!SI->isSimple())
|
||||
continue;
|
||||
if (!isValidElementType(SI->getValueOperand()->getType()))
|
||||
continue;
|
||||
Stores[GetUnderlyingObject(SI->getPointerOperand(), DL)].push_back(SI);
|
||||
++NumStores;
|
||||
}
|
||||
|
||||
// Check that the pointer points to scalars.
|
||||
Type *Ty = SI->getValueOperand()->getType();
|
||||
if (!isValidElementType(Ty))
|
||||
continue;
|
||||
|
||||
// Find the base pointer.
|
||||
Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
|
||||
|
||||
// Save the store locations.
|
||||
StoreRefs[Ptr].push_back(SI);
|
||||
count++;
|
||||
// Ignore getelementptr instructions that have more than one index, a
|
||||
// constant index, or a pointer operand that doesn't point to a scalar
|
||||
// type.
|
||||
else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
|
||||
auto Idx = GEP->idx_begin()->get();
|
||||
if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
|
||||
continue;
|
||||
if (!isValidElementType(Idx->getType()))
|
||||
continue;
|
||||
GEPs[GetUnderlyingObject(GEP->getPointerOperand(), DL)].push_back(GEP);
|
||||
++NumGEPs;
|
||||
}
|
||||
}
|
||||
return count;
|
||||
}
|
||||
|
||||
bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
|
||||
@ -3459,12 +3564,10 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
|
||||
return false;
|
||||
|
||||
unsigned Opcode0 = I0->getOpcode();
|
||||
const DataLayout &DL = I0->getModule()->getDataLayout();
|
||||
|
||||
Type *Ty0 = I0->getType();
|
||||
unsigned Sz = DL.getTypeSizeInBits(Ty0);
|
||||
// FIXME: Register size should be a parameter to this function, so we can
|
||||
// try different vectorization factors.
|
||||
unsigned Sz = R.getVectorElementSize(I0);
|
||||
unsigned VF = MinVecRegSize / Sz;
|
||||
|
||||
for (Value *V : VL) {
|
||||
@ -4183,10 +4286,83 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
|
||||
return Changed;
|
||||
}
|
||||
|
||||
// Tries to vectorize the index computations of the getelementptr instructions
// collected in GEPs. Each per-base-object list is pruned of entries that have
// been nullified and of pairs with a constant difference; the indices of the
// remaining getelementptrs are then handed to tryToVectorizeList in chunks.
// Returns true if any bundle was vectorized.
bool SLPVectorizer::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
  bool Changed = false;
  for (auto &Entry : GEPs) {
    auto &GEPList = Entry.second;

    // If the getelementptr list has fewer than two elements, there's nothing
    // to do.
    if (GEPList.size() < 2)
      continue;

    DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
                 << GEPList.size() << ".\n");

    // Initialize a set of candidate getelementptrs. Note that we use a
    // SetVector here to preserve program order. If the index computations are
    // vectorizable and begin with loads, we want to minimize the chance of
    // having to reorder them later.
    SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());

    // Some of the candidates may have already been vectorized after we
    // initially collected them. If so, the WeakVHs will have nullified the
    // values, so remove them from the set of candidates.
    Candidates.remove(nullptr);

    // Remove from the set of candidates all pairs of getelementptrs with
    // constant differences. Such getelementptrs are likely not good candidates
    // for vectorization in a bottom-up phase since one can be computed from
    // the other.
    for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
      // GEPList still holds the nullified handles that were dropped from
      // Candidates above; they must not be passed to ScalarEvolution.
      Value *GEPI = GEPList[I];
      if (!GEPI)
        continue;
      auto *SCEVI = SE->getSCEV(GEPI);
      for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
        Value *GEPJ = GEPList[J];
        if (!GEPJ)
          continue;
        if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SE->getSCEV(GEPJ)))) {
          Candidates.remove(GEPI);
          Candidates.remove(GEPJ);
        }
      }
    }

    // We break out of the above computation as soon as we know there are fewer
    // than two candidates remaining.
    if (Candidates.size() < 2)
      continue;

    // Add the single, non-constant index of each candidate to the bundle. We
    // ensured the indices met these constraints when we originally collected
    // the getelementptrs.
    SmallVector<Value *, 16> Bundle(Candidates.size());
    unsigned BundleIndex = 0;
    for (auto *V : Candidates) {
      auto *GEP = cast<GetElementPtrInst>(V);
      auto *GEPIdx = GEP->idx_begin()->get();
      // Both constraints must hold: exactly one index, and it is not constant.
      assert(GEP->getNumIndices() == 1 && !isa<Constant>(GEPIdx));
      Bundle[BundleIndex++] = GEPIdx;
    }

    // Try and vectorize the indices. We are currently only interested in
    // gather-like cases of the form:
    //
    // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
    //
    // where the loads of "a", the loads of "b", and the subtractions can be
    // performed in parallel. It's likely that detecting this pattern in a
    // bottom-up phase will be simpler and less costly than building a
    // full-blown top-down phase beginning at the consecutive loads. We process
    // the bundle in chunks of 16 (like we do for stores) to minimize
    // compile-time.
    for (unsigned BI = 0, BE = Bundle.size(); BI < BE; BI += 16) {
      auto Len = std::min<unsigned>(BE - BI, 16);
      Changed |= tryToVectorizeList(makeArrayRef(&Bundle[BI], Len), R);
    }
  }
  return Changed;
}
|
||||
|
||||
bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
|
||||
bool Changed = false;
|
||||
// Attempt to sort and vectorize each of the store-groups.
|
||||
for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
|
||||
for (StoreListMap::iterator it = Stores.begin(), e = Stores.end();
|
||||
it != e; ++it) {
|
||||
if (it->second.size() < 2)
|
||||
continue;
|
||||
|
258
test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
Normal file
258
test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
Normal file
@ -0,0 +1,258 @@
|
||||
; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
|
||||
|
||||
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
|
||||
target triple = "aarch64--linux-gnu"
|
||||
|
||||
; These tests check that we vectorize the index calculations in the
|
||||
; gather-reduce pattern shown below. We check cases having i32 and i64
|
||||
; subtraction.
|
||||
;
|
||||
; int gather_reduce_8x16(short *a, short *b, short *g, int n) {
|
||||
; int sum = 0;
|
||||
; for (int i = 0; i < n ; ++i) {
|
||||
; sum += g[*a++ - b[0]]; sum += g[*a++ - b[1]];
|
||||
; sum += g[*a++ - b[2]]; sum += g[*a++ - b[3]];
|
||||
; sum += g[*a++ - b[4]]; sum += g[*a++ - b[5]];
|
||||
; sum += g[*a++ - b[6]]; sum += g[*a++ - b[7]];
|
||||
; }
|
||||
; return sum;
|
||||
; }
|
||||
|
||||
; CHECK-LABEL: @gather_reduce_8x16_i32
|
||||
;
|
||||
; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
|
||||
; CHECK: zext <8 x i16> [[L]] to <8 x i32>
|
||||
; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
|
||||
; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
|
||||
; CHECK: sext i32 [[X]] to i64
|
||||
;
|
||||
define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
|
||||
entry:
|
||||
%cmp.99 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
|
||||
|
||||
for.body.preheader:
|
||||
br label %for.body
|
||||
|
||||
for.cond.cleanup.loopexit:
|
||||
br label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup:
|
||||
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
|
||||
ret i32 %sum.0.lcssa
|
||||
|
||||
for.body:
|
||||
%i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
||||
%sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
|
||||
%a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
|
||||
%incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
|
||||
%0 = load i16, i16* %a.addr.0101, align 2
|
||||
%conv = zext i16 %0 to i32
|
||||
%incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
|
||||
%1 = load i16, i16* %b, align 2
|
||||
%conv2 = zext i16 %1 to i32
|
||||
%sub = sub nsw i32 %conv, %conv2
|
||||
%arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub
|
||||
%2 = load i16, i16* %arrayidx, align 2
|
||||
%conv3 = zext i16 %2 to i32
|
||||
%add = add nsw i32 %conv3, %sum.0102
|
||||
%incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
|
||||
%3 = load i16, i16* %incdec.ptr, align 2
|
||||
%conv5 = zext i16 %3 to i32
|
||||
%incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
|
||||
%4 = load i16, i16* %incdec.ptr1, align 2
|
||||
%conv7 = zext i16 %4 to i32
|
||||
%sub8 = sub nsw i32 %conv5, %conv7
|
||||
%arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8
|
||||
%5 = load i16, i16* %arrayidx10, align 2
|
||||
%conv11 = zext i16 %5 to i32
|
||||
%add12 = add nsw i32 %add, %conv11
|
||||
%incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
|
||||
%6 = load i16, i16* %incdec.ptr4, align 2
|
||||
%conv14 = zext i16 %6 to i32
|
||||
%incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
|
||||
%7 = load i16, i16* %incdec.ptr6, align 2
|
||||
%conv16 = zext i16 %7 to i32
|
||||
%sub17 = sub nsw i32 %conv14, %conv16
|
||||
%arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17
|
||||
%8 = load i16, i16* %arrayidx19, align 2
|
||||
%conv20 = zext i16 %8 to i32
|
||||
%add21 = add nsw i32 %add12, %conv20
|
||||
%incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
|
||||
%9 = load i16, i16* %incdec.ptr13, align 2
|
||||
%conv23 = zext i16 %9 to i32
|
||||
%incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
|
||||
%10 = load i16, i16* %incdec.ptr15, align 2
|
||||
%conv25 = zext i16 %10 to i32
|
||||
%sub26 = sub nsw i32 %conv23, %conv25
|
||||
%arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26
|
||||
%11 = load i16, i16* %arrayidx28, align 2
|
||||
%conv29 = zext i16 %11 to i32
|
||||
%add30 = add nsw i32 %add21, %conv29
|
||||
%incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
|
||||
%12 = load i16, i16* %incdec.ptr22, align 2
|
||||
%conv32 = zext i16 %12 to i32
|
||||
%incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
|
||||
%13 = load i16, i16* %incdec.ptr24, align 2
|
||||
%conv34 = zext i16 %13 to i32
|
||||
%sub35 = sub nsw i32 %conv32, %conv34
|
||||
%arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35
|
||||
%14 = load i16, i16* %arrayidx37, align 2
|
||||
%conv38 = zext i16 %14 to i32
|
||||
%add39 = add nsw i32 %add30, %conv38
|
||||
%incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
|
||||
%15 = load i16, i16* %incdec.ptr31, align 2
|
||||
%conv41 = zext i16 %15 to i32
|
||||
%incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
|
||||
%16 = load i16, i16* %incdec.ptr33, align 2
|
||||
%conv43 = zext i16 %16 to i32
|
||||
%sub44 = sub nsw i32 %conv41, %conv43
|
||||
%arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44
|
||||
%17 = load i16, i16* %arrayidx46, align 2
|
||||
%conv47 = zext i16 %17 to i32
|
||||
%add48 = add nsw i32 %add39, %conv47
|
||||
%incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
|
||||
%18 = load i16, i16* %incdec.ptr40, align 2
|
||||
%conv50 = zext i16 %18 to i32
|
||||
%incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
|
||||
%19 = load i16, i16* %incdec.ptr42, align 2
|
||||
%conv52 = zext i16 %19 to i32
|
||||
%sub53 = sub nsw i32 %conv50, %conv52
|
||||
%arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53
|
||||
%20 = load i16, i16* %arrayidx55, align 2
|
||||
%conv56 = zext i16 %20 to i32
|
||||
%add57 = add nsw i32 %add48, %conv56
|
||||
%incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
|
||||
%21 = load i16, i16* %incdec.ptr49, align 2
|
||||
%conv59 = zext i16 %21 to i32
|
||||
%22 = load i16, i16* %incdec.ptr51, align 2
|
||||
%conv61 = zext i16 %22 to i32
|
||||
%sub62 = sub nsw i32 %conv59, %conv61
|
||||
%arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62
|
||||
%23 = load i16, i16* %arrayidx64, align 2
|
||||
%conv65 = zext i16 %23 to i32
|
||||
%add66 = add nsw i32 %add57, %conv65
|
||||
%inc = add nuw nsw i32 %i.0103, 1
|
||||
%exitcond = icmp eq i32 %inc, %n
|
||||
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @gather_reduce_8x16_i64
|
||||
;
|
||||
; CHECK-NOT: load <8 x i16>
|
||||
;
|
||||
; FIXME: We are currently unable to vectorize the case with i64 subtraction
|
||||
; because the zero extensions are too expensive. The solution here is to
|
||||
; convert the i64 subtractions to i32 subtractions during vectorization.
|
||||
; This would then match the case above.
|
||||
;
|
||||
define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
|
||||
entry:
|
||||
%cmp.99 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
|
||||
|
||||
for.body.preheader:
|
||||
br label %for.body
|
||||
|
||||
for.cond.cleanup.loopexit:
|
||||
br label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup:
|
||||
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
|
||||
ret i32 %sum.0.lcssa
|
||||
|
||||
for.body:
|
||||
%i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
|
||||
%sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
|
||||
%a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
|
||||
%incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
|
||||
%0 = load i16, i16* %a.addr.0101, align 2
|
||||
%conv = zext i16 %0 to i64
|
||||
%incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
|
||||
%1 = load i16, i16* %b, align 2
|
||||
%conv2 = zext i16 %1 to i64
|
||||
%sub = sub nsw i64 %conv, %conv2
|
||||
%arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub
|
||||
%2 = load i16, i16* %arrayidx, align 2
|
||||
%conv3 = zext i16 %2 to i32
|
||||
%add = add nsw i32 %conv3, %sum.0102
|
||||
%incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
|
||||
%3 = load i16, i16* %incdec.ptr, align 2
|
||||
%conv5 = zext i16 %3 to i64
|
||||
%incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
|
||||
%4 = load i16, i16* %incdec.ptr1, align 2
|
||||
%conv7 = zext i16 %4 to i64
|
||||
%sub8 = sub nsw i64 %conv5, %conv7
|
||||
%arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8
|
||||
%5 = load i16, i16* %arrayidx10, align 2
|
||||
%conv11 = zext i16 %5 to i32
|
||||
%add12 = add nsw i32 %add, %conv11
|
||||
%incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
|
||||
%6 = load i16, i16* %incdec.ptr4, align 2
|
||||
%conv14 = zext i16 %6 to i64
|
||||
%incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
|
||||
%7 = load i16, i16* %incdec.ptr6, align 2
|
||||
%conv16 = zext i16 %7 to i64
|
||||
%sub17 = sub nsw i64 %conv14, %conv16
|
||||
%arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17
|
||||
%8 = load i16, i16* %arrayidx19, align 2
|
||||
%conv20 = zext i16 %8 to i32
|
||||
%add21 = add nsw i32 %add12, %conv20
|
||||
%incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
|
||||
%9 = load i16, i16* %incdec.ptr13, align 2
|
||||
%conv23 = zext i16 %9 to i64
|
||||
%incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
|
||||
%10 = load i16, i16* %incdec.ptr15, align 2
|
||||
%conv25 = zext i16 %10 to i64
|
||||
%sub26 = sub nsw i64 %conv23, %conv25
|
||||
%arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26
|
||||
%11 = load i16, i16* %arrayidx28, align 2
|
||||
%conv29 = zext i16 %11 to i32
|
||||
%add30 = add nsw i32 %add21, %conv29
|
||||
%incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
|
||||
%12 = load i16, i16* %incdec.ptr22, align 2
|
||||
%conv32 = zext i16 %12 to i64
|
||||
%incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
|
||||
%13 = load i16, i16* %incdec.ptr24, align 2
|
||||
%conv34 = zext i16 %13 to i64
|
||||
%sub35 = sub nsw i64 %conv32, %conv34
|
||||
%arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35
|
||||
%14 = load i16, i16* %arrayidx37, align 2
|
||||
%conv38 = zext i16 %14 to i32
|
||||
%add39 = add nsw i32 %add30, %conv38
|
||||
%incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
|
||||
%15 = load i16, i16* %incdec.ptr31, align 2
|
||||
%conv41 = zext i16 %15 to i64
|
||||
%incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
|
||||
%16 = load i16, i16* %incdec.ptr33, align 2
|
||||
%conv43 = zext i16 %16 to i64
|
||||
%sub44 = sub nsw i64 %conv41, %conv43
|
||||
%arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44
|
||||
%17 = load i16, i16* %arrayidx46, align 2
|
||||
%conv47 = zext i16 %17 to i32
|
||||
%add48 = add nsw i32 %add39, %conv47
|
||||
%incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
|
||||
%18 = load i16, i16* %incdec.ptr40, align 2
|
||||
%conv50 = zext i16 %18 to i64
|
||||
%incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
|
||||
%19 = load i16, i16* %incdec.ptr42, align 2
|
||||
%conv52 = zext i16 %19 to i64
|
||||
%sub53 = sub nsw i64 %conv50, %conv52
|
||||
%arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53
|
||||
%20 = load i16, i16* %arrayidx55, align 2
|
||||
%conv56 = zext i16 %20 to i32
|
||||
%add57 = add nsw i32 %add48, %conv56
|
||||
%incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
|
||||
%21 = load i16, i16* %incdec.ptr49, align 2
|
||||
%conv59 = zext i16 %21 to i64
|
||||
%22 = load i16, i16* %incdec.ptr51, align 2
|
||||
%conv61 = zext i16 %22 to i64
|
||||
%sub62 = sub nsw i64 %conv59, %conv61
|
||||
%arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62
|
||||
%23 = load i16, i16* %arrayidx64, align 2
|
||||
%conv65 = zext i16 %23 to i32
|
||||
%add66 = add nsw i32 %add57, %conv65
|
||||
%inc = add nuw nsw i32 %i.0103, 1
|
||||
%exitcond = icmp eq i32 %inc, %n
|
||||
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
|
||||
}
|
111
test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
Normal file
111
test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
Normal file
@ -0,0 +1,111 @@
|
||||
; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine < %s | FileCheck %s
|
||||
|
||||
target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
|
||||
target triple = "aarch64--linux-gnu"
|
||||
|
||||
; These tests check that we remove from consideration pairs of seed
|
||||
; getelementptrs when they are known to have a constant difference. Such pairs
|
||||
; are likely not good candidates for vectorization since one can be computed
|
||||
; from the other. We use an unprofitable threshold to force vectorization.
|
||||
;
|
||||
; int getelementptr(int *g, int n, int w, int x, int y, int z) {
|
||||
; int sum = 0;
|
||||
; for (int i = 0; i < n ; ++i) {
|
||||
; sum += g[2*i + w]; sum += g[2*i + x];
|
||||
; sum += g[2*i + y]; sum += g[2*i + z];
|
||||
; }
|
||||
; return sum;
|
||||
; }
|
||||
;
|
||||
|
||||
; CHECK-LABEL: @getelementptr_4x32
|
||||
;
|
||||
; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <4 x i32>
|
||||
; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> [[A]]
|
||||
; CHECK: sext i32 [[X]] to i64
|
||||
;
|
||||
define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
|
||||
entry:
|
||||
%cmp31 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
|
||||
|
||||
for.body.preheader:
|
||||
br label %for.body
|
||||
|
||||
for.cond.cleanup.loopexit:
|
||||
br label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup:
|
||||
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
|
||||
ret i32 %sum.0.lcssa
|
||||
|
||||
for.body:
|
||||
%indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
|
||||
%sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
|
||||
%t4 = shl nsw i32 %indvars.iv, 1
|
||||
%t5 = add nsw i32 %t4, 0
|
||||
%arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
|
||||
%t6 = load i32, i32* %arrayidx, align 4
|
||||
%add1 = add nsw i32 %t6, %sum.032
|
||||
%t7 = add nsw i32 %t4, %x
|
||||
%arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
|
||||
%t8 = load i32, i32* %arrayidx5, align 4
|
||||
%add6 = add nsw i32 %add1, %t8
|
||||
%t9 = add nsw i32 %t4, %y
|
||||
%arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
|
||||
%t10 = load i32, i32* %arrayidx10, align 4
|
||||
%add11 = add nsw i32 %add6, %t10
|
||||
%t11 = add nsw i32 %t4, %z
|
||||
%arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
|
||||
%t12 = load i32, i32* %arrayidx15, align 4
|
||||
%add16 = add nsw i32 %add11, %t12
|
||||
%indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
|
||||
%exitcond = icmp eq i32 %indvars.iv.next , %n
|
||||
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @getelementptr_2x32
|
||||
;
|
||||
; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <2 x i32>
|
||||
; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[A]]
|
||||
; CHECK: sext i32 [[X]] to i64
|
||||
;
|
||||
define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
|
||||
entry:
|
||||
%cmp31 = icmp sgt i32 %n, 0
|
||||
br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
|
||||
|
||||
for.body.preheader:
|
||||
br label %for.body
|
||||
|
||||
for.cond.cleanup.loopexit:
|
||||
br label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup:
|
||||
%sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
|
||||
ret i32 %sum.0.lcssa
|
||||
|
||||
for.body:
|
||||
%indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
|
||||
%sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
|
||||
%t4 = shl nsw i32 %indvars.iv, 1
|
||||
%t5 = add nsw i32 %t4, 0
|
||||
%arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
|
||||
%t6 = load i32, i32* %arrayidx, align 4
|
||||
%add1 = add nsw i32 %t6, %sum.032
|
||||
%t7 = add nsw i32 %t4, 1
|
||||
%arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
|
||||
%t8 = load i32, i32* %arrayidx5, align 4
|
||||
%add6 = add nsw i32 %add1, %t8
|
||||
%t9 = add nsw i32 %t4, %y
|
||||
%arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
|
||||
%t10 = load i32, i32* %arrayidx10, align 4
|
||||
%add11 = add nsw i32 %add6, %t10
|
||||
%t11 = add nsw i32 %t4, %z
|
||||
%arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
|
||||
%t12 = load i32, i32* %arrayidx15, align 4
|
||||
%add16 = add nsw i32 %add11, %t12
|
||||
%indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
|
||||
%exitcond = icmp eq i32 %indvars.iv.next , %n
|
||||
br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
|
||||
}
|
Loading…
Reference in New Issue
Block a user