diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 1027eacb3ad..ec3e734184a 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -108,7 +108,8 @@ private:
   /// intervening instructions which may affect the memory accessed by the
   /// instructions within Chain.
   ///
-  /// The elements of \p Chain must be all loads or all stores.
+  /// The elements of \p Chain must be all loads or all stores and must be in
+  /// address order.
   ArrayRef<Value *> getVectorizablePrefix(ArrayRef<Value *> Chain);
 
   /// Collects load and store instructions to vectorize.
@@ -424,6 +425,7 @@ Vectorizer::splitOddVectorElts(ArrayRef<Value *> Chain,
 }
 
 ArrayRef<Value *> Vectorizer::getVectorizablePrefix(ArrayRef<Value *> Chain) {
+  // These are in BB order, unlike Chain, which is in address order.
   SmallVector<std::pair<Value *, unsigned>, 16> MemoryInstrs;
   SmallVector<std::pair<Value *, unsigned>, 16> ChainInstrs;
 
@@ -444,31 +446,34 @@ ArrayRef<Value *> Vectorizer::getVectorizablePrefix(ArrayRef<Value *> Chain) {
   assert(Chain.size() == ChainInstrs.size() &&
          "All instrs in Chain must be within range getBoundaryInstrs(Chain).");
 
-  unsigned ChainIdx = 0;
-  for (auto EntryChain : ChainInstrs) {
-    Value *ChainInstrValue = EntryChain.first;
-    unsigned ChainInstrIdx = EntryChain.second;
+  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
+  unsigned ChainInstrIdx, ChainInstrsLen;
+  for (ChainInstrIdx = 0, ChainInstrsLen = ChainInstrs.size();
+       ChainInstrIdx < ChainInstrsLen; ++ChainInstrIdx) {
+    Value *ChainInstr = ChainInstrs[ChainInstrIdx].first;
+    unsigned ChainInstrLoc = ChainInstrs[ChainInstrIdx].second;
+    bool AliasFound = false;
     for (auto EntryMem : MemoryInstrs) {
-      Value *MemInstrValue = EntryMem.first;
-      unsigned MemInstrIdx = EntryMem.second;
-      if (isa<LoadInst>(MemInstrValue) && isa<LoadInst>(ChainInstrValue))
+      Value *MemInstr = EntryMem.first;
+      unsigned MemInstrLoc = EntryMem.second;
+      if (isa<LoadInst>(MemInstr) && isa<LoadInst>(ChainInstr))
         continue;
 
       // We can ignore the alias as long as the load comes before the store,
       // because that means we won't be moving the load past the store to
       // vectorize it (the vectorized load is inserted at the location of the
       // first load in the chain).
-      if (isa<StoreInst>(MemInstrValue) && isa<LoadInst>(ChainInstrValue) &&
-          ChainInstrIdx < MemInstrIdx)
+      if (isa<StoreInst>(MemInstr) && isa<LoadInst>(ChainInstr) &&
+          ChainInstrLoc < MemInstrLoc)
         continue;
 
       // Same case, but in reverse.
-      if (isa<LoadInst>(MemInstrValue) && isa<StoreInst>(ChainInstrValue) &&
-          ChainInstrIdx > MemInstrIdx)
+      if (isa<LoadInst>(MemInstr) && isa<StoreInst>(ChainInstr) &&
+          ChainInstrLoc > MemInstrLoc)
         continue;
 
-      Instruction *M0 = cast<Instruction>(MemInstrValue);
-      Instruction *M1 = cast<Instruction>(ChainInstrValue);
+      Instruction *M0 = cast<Instruction>(MemInstr);
+      Instruction *M1 = cast<Instruction>(ChainInstr);
 
       if (!AA.isNoAlias(MemoryLocation::get(M0), MemoryLocation::get(M1))) {
         DEBUG({
@@ -477,19 +482,34 @@ ArrayRef<Value *> Vectorizer::getVectorizablePrefix(ArrayRef<Value *> Chain) {
           dbgs() << "LSV: Found alias:\n"
                     "  Aliasing instruction and pointer:\n"
-                 << "  " << *MemInstrValue << '\n'
+                 << "  " << *MemInstr << '\n'
                  << "  " << *Ptr0 << '\n'
                  << "  Aliased instruction and pointer:\n"
-                 << "  " << *ChainInstrValue << '\n'
+                 << "  " << *ChainInstr << '\n'
                  << "  " << *Ptr1 << '\n';
         });
-
-        return Chain.slice(0, ChainIdx);
+        AliasFound = true;
+        break;
       }
     }
-    ChainIdx++;
+    if (AliasFound)
+      break;
   }
-  return Chain;
+
+  // Find the largest prefix of Chain whose elements are all in
+  // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
+  // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
+  // order.)
+  auto VectorizableChainInstrs =
+      makeArrayRef(ChainInstrs.data(), ChainInstrIdx);
+  unsigned ChainIdx, ChainLen;
+  for (ChainIdx = 0, ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
+    Value *V = Chain[ChainIdx];
+    if (!any_of(VectorizableChainInstrs,
+                [V](std::pair<Value *, unsigned> CI) { return V == CI.first; }))
+      break;
+  }
+  return Chain.slice(0, ChainIdx);
 }
 
 std::pair
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
index 4672224672f..7078a9b5c49 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
@@ -86,4 +86,34 @@ define float @insert_store_point_alias(float addrspace(1)* nocapture %a, i64 %id
   ret float %x
 }
 
+; Here we have four stores, with an aliasing load before the last one. We
+; could vectorize two of the stores before the load (although we currently
+; don't), but the important thing is that we *don't* sink the store to
+; a[idx + 1] below the load.
+;
+; CHECK-LABEL: @insert_store_point_alias_ooo
+; CHECK: store float
+; CHECK-SAME: %a.idx.3
+; CHECK: store float
+; CHECK-SAME: %a.idx.1
+; CHECK: store float
+; CHECK-SAME: %a.idx.2
+; CHECK: load float, float addrspace(1)* %a.idx.2
+; CHECK: store float
+; CHECK-SAME: %a.idx
+define float @insert_store_point_alias_ooo(float addrspace(1)* nocapture %a, i64 %idx) {
+  %a.idx = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
+  %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a.idx, i64 1
+  %a.idx.2 = getelementptr inbounds float, float addrspace(1)* %a.idx.1, i64 1
+  %a.idx.3 = getelementptr inbounds float, float addrspace(1)* %a.idx.2, i64 1
+
+  store float 0.0, float addrspace(1)* %a.idx.3, align 4
+  store float 0.0, float addrspace(1)* %a.idx.1, align 4
+  store float 0.0, float addrspace(1)* %a.idx.2, align 4
+  %x = load float, float addrspace(1)* %a.idx.2, align 4
+  store float 0.0, float addrspace(1)* %a.idx, align 4
+
+  ret float %x
+}
+
 attributes #0 = { nounwind }
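
The subtle point in the new code is that the alias scan walks ChainInstrs in
basic-block order while the returned prefix must be taken from Chain in
address order, so a BB-order index is not a valid slice point into Chain.
Below is a small standalone sketch of that second phase, using the store
order from the new test. It is illustrative only: InstrId, vectorizablePrefix,
BBOrder, and NumOk are invented stand-ins for this sketch, not LLVM's API.

// Standalone sketch of the second phase of getVectorizablePrefix. Phase 1
// (not shown) scans the chain in basic-block order and stops at the first
// member that aliases another memory instruction; phase 2, modeled here,
// keeps the longest prefix of the address-ordered chain whose members all
// survived phase 1.
#include <algorithm>
#include <cstdio>
#include <vector>

using InstrId = int; // stands in for Value * in the real pass

// Chain is in address order; BBOrder holds the same instructions in
// basic-block order, of which only the first NumOk are safe to vectorize.
static std::vector<InstrId>
vectorizablePrefix(const std::vector<InstrId> &Chain,
                   const std::vector<InstrId> &BBOrder, std::size_t NumOk) {
  std::size_t ChainIdx = 0;
  for (std::size_t ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
    InstrId V = Chain[ChainIdx];
    // Mirrors the any_of over VectorizableChainInstrs in the patch.
    if (!std::any_of(BBOrder.begin(), BBOrder.begin() + NumOk,
                     [V](InstrId I) { return I == V; }))
      break; // first element outside the safe set ends the prefix
  }
  return {Chain.begin(), Chain.begin() + ChainIdx};
}

int main() {
  // The insert_store_point_alias_ooo test: stores hit a[3], a[1], a[2], a[0]
  // in BB order, and the aliasing load stops the phase-1 scan after two of
  // them (NumOk = 2). The address-order chain is a[0..3].
  std::vector<InstrId> BBOrder = {3, 1, 2, 0};
  std::vector<InstrId> Chain = {0, 1, 2, 3};
  std::vector<InstrId> Prefix = vectorizablePrefix(Chain, BBOrder, /*NumOk=*/2);
  std::printf("prefix length: %zu\n", Prefix.size()); // prints 0
  // a[0] is not among {a[3], a[1]}, so nothing is vectorized. The old code,
  // which sliced the address-ordered Chain at the BB-order index, would have
  // returned two elements here, including the store to a[idx] that follows
  // the load in BB order.
}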