diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index f69a4e52c7e..5b0b2e0cf19 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -412,6 +412,13 @@ public:
     return NumLoadsWantToChangeOrder > NumLoadsWantToKeepOrder;
   }

+  /// \return The vector element size in bits to use when vectorizing the
+  /// expression tree ending at \p V. If V is a store, the size is the width of
+  /// the stored value. Otherwise, the size is the width of the largest loaded
+  /// value reaching V. This method is used by the vectorizer to calculate
+  /// vectorization factors.
+  unsigned getVectorElementSize(Value *V);
+
 private:
   struct TreeEntry;

@@ -3139,10 +3146,73 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
   BS->ScheduleStart = nullptr;
 }

+unsigned BoUpSLP::getVectorElementSize(Value *V) {
+  auto &DL = F->getParent()->getDataLayout();
+
+  // If V is a store, just return the width of the stored value without
+  // traversing the expression tree. This is the common case.
+  if (auto *Store = dyn_cast<StoreInst>(V))
+    return DL.getTypeSizeInBits(Store->getValueOperand()->getType());
+
+  // If V is not a store, we can traverse the expression tree to find loads
+  // that feed it. The type of the loaded value may indicate a more suitable
+  // width than V's type. We want to base the vector element size on the width
+  // of memory operations where possible.
+  SmallVector<Instruction *, 16> Worklist;
+  SmallPtrSet<Instruction *, 16> Visited;
+  if (auto *I = dyn_cast<Instruction>(V))
+    Worklist.push_back(I);
+
+  // Traverse the expression tree in bottom-up order looking for loads. If we
+  // encounter an instruction we don't yet handle, we give up.
+  auto MaxWidth = 0u;
+  auto FoundUnknownInst = false;
+  while (!Worklist.empty() && !FoundUnknownInst) {
+    auto *I = Worklist.pop_back_val();
+    Visited.insert(I);
+
+    // We should only be looking at scalar instructions here. If the current
+    // instruction has a vector type, give up.
+    auto *Ty = I->getType();
+    if (isa<VectorType>(Ty))
+      FoundUnknownInst = true;
+
+    // If the current instruction is a load, update MaxWidth to reflect the
+    // width of the loaded value.
+    else if (isa<LoadInst>(I))
+      MaxWidth = std::max(MaxWidth, (unsigned)DL.getTypeSizeInBits(Ty));
+
+    // Otherwise, we need to visit the operands of the instruction. We only
+    // handle the interesting cases from buildTree here. If an operand is an
+    // instruction we haven't yet visited, we add it to the worklist.
+    else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
+             isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
+      for (Use &U : I->operands())
+        if (auto *J = dyn_cast<Instruction>(U.get()))
+          if (!Visited.count(J))
+            Worklist.push_back(J);
+    }
+
+    // If we don't yet handle the instruction, give up.
+    else
+      FoundUnknownInst = true;
+  }
+
+  // If we didn't encounter a memory access in the expression tree, or if we
+  // gave up for some reason, just return the width of V.
+  if (!MaxWidth || FoundUnknownInst)
+    return DL.getTypeSizeInBits(V->getType());
+
+  // Otherwise, return the maximum width we found.
+  return MaxWidth;
+}
+
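Note: a minimal sketch of the intended effect of getVectorElementSize on the
vectorization factor; the widths below are illustrative, not taken from the
patch:

    // If the expression tree ending at V is fed only by i16 loads, the
    // element size is 16 even when V itself produces an i64, so a 128-bit
    // register yields VF = 128 / 16 = 8 lanes instead of 128 / 64 = 2.
    unsigned Sz = R.getVectorElementSize(V); // 16 in the scenario above
    unsigned VF = MaxVecRegSize / Sz;        // 8 lanes rather than 2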
 /// The SLPVectorizer Pass.
 struct SLPVectorizer : public FunctionPass {
   typedef SmallVector<StoreInst *, 8> StoreList;
   typedef MapVector<Value *, StoreList> StoreListMap;
+  typedef SmallVector<WeakVH, 8> WeakVHList;
+  typedef MapVector<Value *, WeakVHList> WeakVHListMap;

   /// Pass identification, replacement for typeid
   static char ID;
@@ -3172,7 +3242,8 @@ struct SLPVectorizer : public FunctionPass {
     DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
     AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

-    StoreRefs.clear();
+    Stores.clear();
+    GEPs.clear();
     bool Changed = false;

     // If the target claims to have no vector registers don't attempt
@@ -3206,15 +3277,24 @@ struct SLPVectorizer : public FunctionPass {
     // Scan the blocks in the function in post order.
     for (auto BB : post_order(&F.getEntryBlock())) {
+      collectSeedInstructions(BB);
+
       // Vectorize trees that end at stores.
-      if (unsigned count = collectStores(BB, R)) {
-        (void)count;
-        DEBUG(dbgs() << "SLP: Found " << count << " stores to vectorize.\n");
+      if (NumStores > 0) {
+        DEBUG(dbgs() << "SLP: Found " << NumStores << " stores.\n");
         Changed |= vectorizeStoreChains(R);
       }

       // Vectorize trees that end at reductions.
       Changed |= vectorizeChainsInBlock(BB, R);
+
+      // Vectorize the index computations of getelementptr instructions. This
+      // is primarily intended to catch gather-like idioms ending at
+      // non-consecutive loads.
+      if (NumGEPs > 0) {
+        DEBUG(dbgs() << "SLP: Found " << NumGEPs << " GEPs.\n");
+        Changed |= vectorizeGEPIndices(BB, R);
+      }
     }

     if (Changed) {
@@ -3241,12 +3321,14 @@ struct SLPVectorizer : public FunctionPass {
   }

 private:
-
-  /// \brief Collect memory references and sort them according to their base
-  /// object. We sort the stores to their base objects to reduce the cost of the
-  /// quadratic search on the stores. TODO: We can further reduce this cost
-  /// if we flush the chain creation every time we run into a memory barrier.
-  unsigned collectStores(BasicBlock *BB, BoUpSLP &R);
+  /// \brief Collect store and getelementptr instructions and organize them
+  /// according to the underlying object of their pointer operands. We sort the
+  /// instructions by their underlying objects to reduce the cost of
+  /// consecutive access queries.
+  ///
+  /// TODO: We can further reduce this cost if we flush the chain creation
+  /// every time we run into a memory barrier.
+  void collectSeedInstructions(BasicBlock *BB);

   /// \brief Try to vectorize a chain that starts at two arithmetic instrs.
   bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R);
@@ -3262,9 +3344,13 @@ private:
   /// \brief Try to vectorize a chain that may start at the operands of \p V;
   bool tryToVectorize(BinaryOperator *V, BoUpSLP &R);

-  /// \brief Vectorize the stores that were collected in StoreRefs.
+  /// \brief Vectorize the store instructions collected in Stores.
   bool vectorizeStoreChains(BoUpSLP &R);

+  /// \brief Vectorize the index computations of the getelementptr instructions
+  /// collected in GEPs.
+  bool vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R);
+
   /// \brief Scan the basic block and look for patterns that are likely to start
   /// a vectorization chain.
   bool vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R);
@@ -3274,8 +3360,19 @@ private:
   bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold,
                        BoUpSLP &R);
-private:
-  StoreListMap StoreRefs;
+
+  /// The store instructions in a basic block organized by base pointer.
+  StoreListMap Stores;
+
+  /// The getelementptr instructions in a basic block organized by base
+  /// pointer.
+  WeakVHListMap GEPs;
+
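Note: the GEP buckets deliberately hold WeakVH rather than raw instruction
pointers. A minimal sketch of the assumed semantics (SomeGEP is a hypothetical
instruction used only for illustration):

    WeakVH H(SomeGEP);          // handle tracks the instruction
    SomeGEP->eraseFromParent(); // e.g. deleted by earlier store vectorization
    assert(H == nullptr);       // handle nullifies instead of dangling

This is why vectorizeGEPIndices below can simply call
Candidates.remove(nullptr) to drop seeds deleted between collection and use.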
+  /// The number of store instructions in a basic block.
+  unsigned NumStores;
+
+  /// The number of getelementptr instructions in a basic block.
+  unsigned NumGEPs;
+
   unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
 };
@@ -3296,9 +3393,7 @@ bool SLPVectorizer::vectorizeStoreChain(ArrayRef<Value *> Chain,
   unsigned ChainLen = Chain.size();
   DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << ChainLen
                << "\n");
-  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
-  auto &DL = cast<StoreInst>(Chain[0])->getModule()->getDataLayout();
-  unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+  unsigned Sz = R.getVectorElementSize(Chain[0]);
   unsigned VF = VecRegSize / Sz;

   if (!isPowerOf2_32(Sz) || VF < 2)
@@ -3409,33 +3504,43 @@ bool SLPVectorizer::vectorizeStores(ArrayRef<StoreInst *> Stores,
   return Changed;
 }

+void SLPVectorizer::collectSeedInstructions(BasicBlock *BB) {
-unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) {
-  unsigned count = 0;
-  StoreRefs.clear();
+  // Initialize the collections. We will make a single pass over the block.
+  Stores.clear();
+  GEPs.clear();
+  NumStores = NumGEPs = 0;
   const DataLayout &DL = BB->getModule()->getDataLayout();
+
+  // Visit the store and getelementptr instructions in BB and organize them in
+  // Stores and GEPs according to the underlying objects of their pointer
+  // operands.
   for (Instruction &I : *BB) {
-    StoreInst *SI = dyn_cast<StoreInst>(&I);
-    if (!SI)
-      continue;
-    // Don't touch volatile stores.
-    if (!SI->isSimple())
-      continue;

+    // Ignore store instructions that are volatile or have a pointer operand
+    // that doesn't point to a scalar type.
+    if (auto *SI = dyn_cast<StoreInst>(&I)) {
+      if (!SI->isSimple())
+        continue;
+      if (!isValidElementType(SI->getValueOperand()->getType()))
+        continue;
+      Stores[GetUnderlyingObject(SI->getPointerOperand(), DL)].push_back(SI);
+      ++NumStores;
+    }

-    // Check that the pointer points to scalars.
-    Type *Ty = SI->getValueOperand()->getType();
-    if (!isValidElementType(Ty))
-      continue;
-
-    // Find the base pointer.
-    Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL);
-
-    // Save the store locations.
-    StoreRefs[Ptr].push_back(SI);
-    count++;
+    // Ignore getelementptr instructions that have more than one index, a
+    // constant index, or a pointer operand that doesn't point to a scalar
+    // type.
+    else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+      auto Idx = GEP->idx_begin()->get();
+      if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+        continue;
+      if (!isValidElementType(Idx->getType()))
+        continue;
+      GEPs[GetUnderlyingObject(GEP->getPointerOperand(), DL)].push_back(GEP);
+      ++NumGEPs;
+    }
   }
-  return count;
 }

 bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
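Note: a minimal sketch, under the constraints stated above, of what makes a
getelementptr a usable seed (isGEPSeedCandidate is a hypothetical helper, not
part of the patch):

    static bool isGEPSeedCandidate(GetElementPtrInst *GEP) {
      Value *Idx = GEP->idx_begin()->get();
      return GEP->getNumIndices() == 1 && !isa<Constant>(Idx) &&
             isValidElementType(Idx->getType());
    }

For example, "getelementptr i16, i16* %g, i32 %sub" qualifies, while a GEP
with a constant index or a second index does not.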
@@ -3459,12 +3564,10 @@ bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     return false;

   unsigned Opcode0 = I0->getOpcode();
-  const DataLayout &DL = I0->getModule()->getDataLayout();
-  Type *Ty0 = I0->getType();
-  unsigned Sz = DL.getTypeSizeInBits(Ty0);

   // FIXME: Register size should be a parameter to this function, so we can
   // try different vectorization factors.
+  unsigned Sz = R.getVectorElementSize(I0);
   unsigned VF = MinVecRegSize / Sz;

   for (Value *V : VL) {
@@ -4183,10 +4286,83 @@ bool SLPVectorizer::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
   return Changed;
 }

+bool SLPVectorizer::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
+  auto Changed = false;
+  for (auto &Entry : GEPs) {
+    auto &GEPList = Entry.second;
+
+    // If the getelementptr list has fewer than two elements, there's nothing
+    // to do.
+    if (GEPList.size() < 2)
+      continue;
+
+    DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
+                 << GEPList.size() << ".\n");
+
+    // Initialize a set of candidate getelementptrs. Note that we use a
+    // SetVector here to preserve program order. If the index computations are
+    // vectorizable and begin with loads, we want to minimize the chance of
+    // having to reorder them later.
+    SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
+
+    // Some of the candidates may have already been vectorized after we
+    // initially collected them. If so, the WeakVHs will have nullified the
+    // values, so remove them from the set of candidates.
+    Candidates.remove(nullptr);
+
+    // Remove from the set of candidates all pairs of getelementptrs with
+    // constant differences. Such getelementptrs are likely not good candidates
+    // for vectorization in a bottom-up phase since one can be computed from
+    // the other.
+    for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
+      auto *GEP = SE->getSCEV(GEPList[I]);
+      for (int J = I + 1; J < E && Candidates.size() > 1; ++J)
+        if (isa<SCEVConstant>(SE->getMinusSCEV(GEP, SE->getSCEV(GEPList[J])))) {
+          Candidates.remove(GEPList[I]);
+          Candidates.remove(GEPList[J]);
+        }
+    }
+
+    // We break out of the above computation as soon as we know there are fewer
+    // than two candidates remaining.
+    if (Candidates.size() < 2)
+      continue;
+
+    // Add the single, non-constant index of each candidate to the bundle. We
+    // ensured the indices met these constraints when we originally collected
+    // the getelementptrs.
+    SmallVector<Value *, 16> Bundle(Candidates.size());
+    auto BundleIndex = 0u;
+    for (auto *V : Candidates) {
+      auto *GEP = cast<GetElementPtrInst>(V);
+      auto *GEPIdx = GEP->idx_begin()->get();
+      assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
+      Bundle[BundleIndex++] = GEPIdx;
+    }
+
+    // Try and vectorize the indices. We are currently only interested in
+    // gather-like cases of the form:
+    //
+    //   ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
+    //
+    // where the loads of "a", the loads of "b", and the subtractions can be
+    // performed in parallel. It's likely that detecting this pattern in a
+    // bottom-up phase will be simpler and less costly than building a
+    // full-blown top-down phase beginning at the consecutive loads. We process
+    // the bundle in chunks of 16 (like we do for stores) to minimize
+    // compile-time.
+    for (unsigned BI = 0, BE = Bundle.size(); BI < BE; BI += 16) {
+      auto Len = std::min<unsigned>(BE - BI, 16);
+      Changed |= tryToVectorizeList(makeArrayRef(&Bundle[BI], Len), R);
+    }
+  }
+  return Changed;
+}
+
 bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) {
   bool Changed = false;
   // Attempt to sort and vectorize each of the store-groups.
-  for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end();
+  for (StoreListMap::iterator it = Stores.begin(), e = Stores.end();
        it != e; ++it) {
     if (it->second.size() < 2)
       continue;
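Note: a minimal sketch of the constant-difference test used above (GEPA and
GEPB are hypothetical candidate getelementptrs):

    // For g[2*i + x] and g[2*i + x + 2], subtracting the two pointer SCEVs
    // folds to a constant, so the pair is pruned: computing one address from
    // the other is cheaper than vectorizing their index computations.
    const SCEV *Diff = SE->getMinusSCEV(SE->getSCEV(GEPA), SE->getSCEV(GEPB));
    bool Redundant = isa<SCEVConstant>(Diff); // true for the pair above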
diff --git a/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
new file mode 100644
index 00000000000..59ceba1717a
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -0,0 +1,258 @@
+; RUN: opt -S -slp-vectorizer -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; These tests check that we vectorize the index calculations in the
+; gather-reduce pattern shown below. We check cases having i32 and i64
+; subtraction.
+;
+; int gather_reduce_8x16(short *a, short *b, short *g, int n) {
+;   int sum = 0;
+;   for (int i = 0; i < n ; ++i) {
+;     sum += g[*a++ - b[0]]; sum += g[*a++ - b[1]];
+;     sum += g[*a++ - b[2]]; sum += g[*a++ - b[3]];
+;     sum += g[*a++ - b[4]]; sum += g[*a++ - b[5]];
+;     sum += g[*a++ - b[6]]; sum += g[*a++ - b[7]];
+;   }
+;   return sum;
+; }
+
+; CHECK-LABEL: @gather_reduce_8x16_i32
+;
+; CHECK: [[L:%[a-zA-Z0-9.]+]] = load <8 x i16>
+; CHECK: zext <8 x i16> [[L]] to <8 x i32>
+; CHECK: [[S:%[a-zA-Z0-9.]+]] = sub nsw <8 x i32>
+; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <8 x i32> [[S]]
+; CHECK: sext i32 [[X]] to i64
+;
+define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+entry:
+  %cmp.99 = icmp sgt i32 %n, 0
+  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
+  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
+  %0 = load i16, i16* %a.addr.0101, align 2
+  %conv = zext i16 %0 to i32
+  %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
+  %1 = load i16, i16* %b, align 2
+  %conv2 = zext i16 %1 to i32
+  %sub = sub nsw i32 %conv, %conv2
+  %arrayidx = getelementptr inbounds i16, i16* %g, i32 %sub
+  %2 = load i16, i16* %arrayidx, align 2
+  %conv3 = zext i16 %2 to i32
+  %add = add nsw i32 %conv3, %sum.0102
+  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
+  %3 = load i16, i16* %incdec.ptr, align 2
+  %conv5 = zext i16 %3 to i32
+  %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
+  %4 = load i16, i16* %incdec.ptr1, align 2
+  %conv7 = zext i16 %4 to i32
+  %sub8 = sub nsw i32 %conv5, %conv7
+  %arrayidx10 = getelementptr inbounds i16, i16* %g, i32 %sub8
+  %5 = load i16, i16* %arrayidx10, align 2
+  %conv11 = zext i16 %5 to i32
+  %add12 = add nsw i32 %add, %conv11
+  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
+  %6 = load i16, i16* %incdec.ptr4, align 2
+  %conv14 = zext i16 %6 to i32
+  %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
+  %7 = load i16, i16* %incdec.ptr6, align 2
+  %conv16 = zext i16 %7 to i32
+  %sub17 = sub nsw i32 %conv14, %conv16
+  %arrayidx19 = getelementptr inbounds i16, i16* %g, i32 %sub17
+  %8 = load i16, i16* %arrayidx19, align 2
+  %conv20 = zext i16 %8 to i32
+  %add21 = add nsw i32 %add12, %conv20
+  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
+  %9 = load i16, i16* %incdec.ptr13, align 2
+  %conv23 = zext i16 %9 to i32
+  %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
+  %10 = load i16, i16* %incdec.ptr15, align 2
+  %conv25 = zext i16 %10 to i32
+  %sub26 = sub nsw i32 %conv23, %conv25
+  %arrayidx28 = getelementptr inbounds i16, i16* %g, i32 %sub26
+  %11 = load i16, i16* %arrayidx28, align 2
+  %conv29 = zext i16 %11 to i32
+  %add30 = add nsw i32 %add21, %conv29
+  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
+  %12 = load i16, i16* %incdec.ptr22, align 2
+  %conv32 = zext i16 %12 to i32
+  %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
+  %13 = load i16, i16* %incdec.ptr24, align 2
+  %conv34 = zext i16 %13 to i32
+  %sub35 = sub nsw i32 %conv32, %conv34
+  %arrayidx37 = getelementptr inbounds i16, i16* %g, i32 %sub35
+  %14 = load i16, i16* %arrayidx37, align 2
+  %conv38 = zext i16 %14 to i32
+  %add39 = add nsw i32 %add30, %conv38
+  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
+  %15 = load i16, i16* %incdec.ptr31, align 2
+  %conv41 = zext i16 %15 to i32
+  %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
+  %16 = load i16, i16* %incdec.ptr33, align 2
+  %conv43 = zext i16 %16 to i32
+  %sub44 = sub nsw i32 %conv41, %conv43
+  %arrayidx46 = getelementptr inbounds i16, i16* %g, i32 %sub44
+  %17 = load i16, i16* %arrayidx46, align 2
+  %conv47 = zext i16 %17 to i32
+  %add48 = add nsw i32 %add39, %conv47
+  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
+  %18 = load i16, i16* %incdec.ptr40, align 2
+  %conv50 = zext i16 %18 to i32
+  %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
+  %19 = load i16, i16* %incdec.ptr42, align 2
+  %conv52 = zext i16 %19 to i32
+  %sub53 = sub nsw i32 %conv50, %conv52
+  %arrayidx55 = getelementptr inbounds i16, i16* %g, i32 %sub53
+  %20 = load i16, i16* %arrayidx55, align 2
+  %conv56 = zext i16 %20 to i32
+  %add57 = add nsw i32 %add48, %conv56
+  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
+  %21 = load i16, i16* %incdec.ptr49, align 2
+  %conv59 = zext i16 %21 to i32
+  %22 = load i16, i16* %incdec.ptr51, align 2
+  %conv61 = zext i16 %22 to i32
+  %sub62 = sub nsw i32 %conv59, %conv61
+  %arrayidx64 = getelementptr inbounds i16, i16* %g, i32 %sub62
+  %23 = load i16, i16* %arrayidx64, align 2
+  %conv65 = zext i16 %23 to i32
+  %add66 = add nsw i32 %add57, %conv65
+  %inc = add nuw nsw i32 %i.0103, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; CHECK-LABEL: @gather_reduce_8x16_i64
+;
+; CHECK-NOT: load <8 x i16>
+;
+; FIXME: We are currently unable to vectorize the case with i64 subtraction
+; because the zero extensions are too expensive. The solution here is to
+; convert the i64 subtractions to i32 subtractions during vectorization.
+; This would then match the case above.
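Note: a minimal sketch (hypothetical IRBuilder code, not part of the patch) of
the conversion the FIXME suggests: perform the subtraction in 32 bits and
sign-extend only the final index that feeds the address computation:

    Value *A32 = Builder.CreateZExt(LoadA, Builder.getInt32Ty());
    Value *B32 = Builder.CreateZExt(LoadB, Builder.getInt32Ty());
    Value *Sub = Builder.CreateNSWSub(A32, B32);                // i32 sub
    Value *Idx = Builder.CreateSExt(Sub, Builder.getInt64Ty()); // feeds the GEP

This would make the i64 test vectorize like the i32 case above.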
+;
+define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture readonly %b, i16* nocapture readonly %g, i32 %n) {
+entry:
+  %cmp.99 = icmp sgt i32 %n, 0
+  br i1 %cmp.99, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add66, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %i.0103 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %sum.0102 = phi i32 [ %add66, %for.body ], [ 0, %for.body.preheader ]
+  %a.addr.0101 = phi i16* [ %incdec.ptr58, %for.body ], [ %a, %for.body.preheader ]
+  %incdec.ptr = getelementptr inbounds i16, i16* %a.addr.0101, i64 1
+  %0 = load i16, i16* %a.addr.0101, align 2
+  %conv = zext i16 %0 to i64
+  %incdec.ptr1 = getelementptr inbounds i16, i16* %b, i64 1
+  %1 = load i16, i16* %b, align 2
+  %conv2 = zext i16 %1 to i64
+  %sub = sub nsw i64 %conv, %conv2
+  %arrayidx = getelementptr inbounds i16, i16* %g, i64 %sub
+  %2 = load i16, i16* %arrayidx, align 2
+  %conv3 = zext i16 %2 to i32
+  %add = add nsw i32 %conv3, %sum.0102
+  %incdec.ptr4 = getelementptr inbounds i16, i16* %a.addr.0101, i64 2
+  %3 = load i16, i16* %incdec.ptr, align 2
+  %conv5 = zext i16 %3 to i64
+  %incdec.ptr6 = getelementptr inbounds i16, i16* %b, i64 2
+  %4 = load i16, i16* %incdec.ptr1, align 2
+  %conv7 = zext i16 %4 to i64
+  %sub8 = sub nsw i64 %conv5, %conv7
+  %arrayidx10 = getelementptr inbounds i16, i16* %g, i64 %sub8
+  %5 = load i16, i16* %arrayidx10, align 2
+  %conv11 = zext i16 %5 to i32
+  %add12 = add nsw i32 %add, %conv11
+  %incdec.ptr13 = getelementptr inbounds i16, i16* %a.addr.0101, i64 3
+  %6 = load i16, i16* %incdec.ptr4, align 2
+  %conv14 = zext i16 %6 to i64
+  %incdec.ptr15 = getelementptr inbounds i16, i16* %b, i64 3
+  %7 = load i16, i16* %incdec.ptr6, align 2
+  %conv16 = zext i16 %7 to i64
+  %sub17 = sub nsw i64 %conv14, %conv16
+  %arrayidx19 = getelementptr inbounds i16, i16* %g, i64 %sub17
+  %8 = load i16, i16* %arrayidx19, align 2
+  %conv20 = zext i16 %8 to i32
+  %add21 = add nsw i32 %add12, %conv20
+  %incdec.ptr22 = getelementptr inbounds i16, i16* %a.addr.0101, i64 4
+  %9 = load i16, i16* %incdec.ptr13, align 2
+  %conv23 = zext i16 %9 to i64
+  %incdec.ptr24 = getelementptr inbounds i16, i16* %b, i64 4
+  %10 = load i16, i16* %incdec.ptr15, align 2
+  %conv25 = zext i16 %10 to i64
+  %sub26 = sub nsw i64 %conv23, %conv25
+  %arrayidx28 = getelementptr inbounds i16, i16* %g, i64 %sub26
+  %11 = load i16, i16* %arrayidx28, align 2
+  %conv29 = zext i16 %11 to i32
+  %add30 = add nsw i32 %add21, %conv29
+  %incdec.ptr31 = getelementptr inbounds i16, i16* %a.addr.0101, i64 5
+  %12 = load i16, i16* %incdec.ptr22, align 2
+  %conv32 = zext i16 %12 to i64
+  %incdec.ptr33 = getelementptr inbounds i16, i16* %b, i64 5
+  %13 = load i16, i16* %incdec.ptr24, align 2
+  %conv34 = zext i16 %13 to i64
+  %sub35 = sub nsw i64 %conv32, %conv34
+  %arrayidx37 = getelementptr inbounds i16, i16* %g, i64 %sub35
+  %14 = load i16, i16* %arrayidx37, align 2
+  %conv38 = zext i16 %14 to i32
+  %add39 = add nsw i32 %add30, %conv38
+  %incdec.ptr40 = getelementptr inbounds i16, i16* %a.addr.0101, i64 6
+  %15 = load i16, i16* %incdec.ptr31, align 2
+  %conv41 = zext i16 %15 to i64
+  %incdec.ptr42 = getelementptr inbounds i16, i16* %b, i64 6
+  %16 = load i16, i16* %incdec.ptr33, align 2
+  %conv43 = zext i16 %16 to i64
+  %sub44 = sub nsw i64 %conv41, %conv43
+  %arrayidx46 = getelementptr inbounds i16, i16* %g, i64 %sub44
+  %17 = load i16, i16* %arrayidx46, align 2
+  %conv47 = zext i16 %17 to i32
+  %add48 = add nsw i32 %add39, %conv47
+  %incdec.ptr49 = getelementptr inbounds i16, i16* %a.addr.0101, i64 7
+  %18 = load i16, i16* %incdec.ptr40, align 2
+  %conv50 = zext i16 %18 to i64
+  %incdec.ptr51 = getelementptr inbounds i16, i16* %b, i64 7
+  %19 = load i16, i16* %incdec.ptr42, align 2
+  %conv52 = zext i16 %19 to i64
+  %sub53 = sub nsw i64 %conv50, %conv52
+  %arrayidx55 = getelementptr inbounds i16, i16* %g, i64 %sub53
+  %20 = load i16, i16* %arrayidx55, align 2
+  %conv56 = zext i16 %20 to i32
+  %add57 = add nsw i32 %add48, %conv56
+  %incdec.ptr58 = getelementptr inbounds i16, i16* %a.addr.0101, i64 8
+  %21 = load i16, i16* %incdec.ptr49, align 2
+  %conv59 = zext i16 %21 to i64
+  %22 = load i16, i16* %incdec.ptr51, align 2
+  %conv61 = zext i16 %22 to i64
+  %sub62 = sub nsw i64 %conv59, %conv61
+  %arrayidx64 = getelementptr inbounds i16, i16* %g, i64 %sub62
+  %23 = load i16, i16* %arrayidx64, align 2
+  %conv65 = zext i16 %23 to i32
+  %add66 = add nsw i32 %add57, %conv65
+  %inc = add nuw nsw i32 %i.0103, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll b/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
new file mode 100644
index 00000000000..e9b71963530
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/AArch64/getelementptr.ll
@@ -0,0 +1,111 @@
+; RUN: opt -S -slp-vectorizer -slp-threshold=-18 -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-m:e-i32:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; These tests check that we remove from consideration pairs of seed
+; getelementptrs when they are known to have a constant difference. Such pairs
+; are likely not good candidates for vectorization since one can be computed
+; from the other. We use an unprofitable threshold to force vectorization.
+;
+; int getelementptr(int *g, int n, int w, int x, int y, int z) {
+;   int sum = 0;
+;   for (int i = 0; i < n ; ++i) {
+;     sum += g[2*i + w]; sum += g[2*i + x];
+;     sum += g[2*i + y]; sum += g[2*i + z];
+;   }
+;   return sum;
+; }
+;

+; CHECK-LABEL: @getelementptr_4x32
+;
+; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <4 x i32>
+; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <4 x i32> [[A]]
+; CHECK: sext i32 [[X]] to i64
+;
+define i32 @getelementptr_4x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
+  %t4 = shl nsw i32 %indvars.iv, 1
+  %t5 = add nsw i32 %t4, 0
+  %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
+  %t6 = load i32, i32* %arrayidx, align 4
+  %add1 = add nsw i32 %t6, %sum.032
+  %t7 = add nsw i32 %t4, %x
+  %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
+  %t8 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add1, %t8
+  %t9 = add nsw i32 %t4, %y
+  %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
+  %t10 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %add6, %t10
+  %t11 = add nsw i32 %t4, %z
+  %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
+  %t12 = load i32, i32* %arrayidx15, align 4
+  %add16 = add nsw i32 %add11, %t12
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next , %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; CHECK-LABEL: @getelementptr_2x32
+;
+; CHECK: [[A:%[a-zA-Z0-9.]+]] = add nsw <2 x i32>
+; CHECK: [[X:%[a-zA-Z0-9.]+]] = extractelement <2 x i32> [[A]]
+; CHECK: sext i32 [[X]] to i64
+;
+define i32 @getelementptr_2x32(i32* nocapture readonly %g, i32 %n, i32 %x, i32 %y, i32 %z) {
+entry:
+  %cmp31 = icmp sgt i32 %n, 0
+  br i1 %cmp31, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add16, %for.cond.cleanup.loopexit ]
+  ret i32 %sum.0.lcssa
+
+for.body:
+  %indvars.iv = phi i32 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add16, %for.body ]
+  %t4 = shl nsw i32 %indvars.iv, 1
+  %t5 = add nsw i32 %t4, 0
+  %arrayidx = getelementptr inbounds i32, i32* %g, i32 %t5
+  %t6 = load i32, i32* %arrayidx, align 4
+  %add1 = add nsw i32 %t6, %sum.032
+  %t7 = add nsw i32 %t4, 1
+  %arrayidx5 = getelementptr inbounds i32, i32* %g, i32 %t7
+  %t8 = load i32, i32* %arrayidx5, align 4
+  %add6 = add nsw i32 %add1, %t8
+  %t9 = add nsw i32 %t4, %y
+  %arrayidx10 = getelementptr inbounds i32, i32* %g, i32 %t9
+  %t10 = load i32, i32* %arrayidx10, align 4
+  %add11 = add nsw i32 %add6, %t10
+  %t11 = add nsw i32 %t4, %z
+  %arrayidx15 = getelementptr inbounds i32, i32* %g, i32 %t11
+  %t12 = load i32, i32* %arrayidx15, align 4
+  %add16 = add nsw i32 %add11, %t12
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next , %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
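Note: in @getelementptr_2x32 above, the SCEVs of the first two indices
(2*i + 0 and 2*i + 1) differ by the constant 1, so that pair of seeds is
pruned and only the %y and %z indices survive, which is why the test expects
a <2 x i32> add rather than a <4 x i32> one. A minimal sketch of the
surviving bundle (values are illustrative):

    // After pruning: Candidates = { %arrayidx10, %arrayidx15 }
    //             -> Bundle     = { %t9, %t11 }
    Changed |= tryToVectorizeList(makeArrayRef(&Bundle[0], 2), R);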