llvm-mirror/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
Roman Tereshin c9bf1bce58 [LSV] Refactoring + supporting bitcasts to a type of different size
This is mostly preparation work for adding limited support for
select instructions. That proved difficult to do due to the size and
irregularity of Vectorizer::isConsecutiveAccess, which I believe is
fixed here.

It also turned out that these changes make it simpler to finish one of
the TODOs and fix a number of other small issues, namely:

1. Looking through bitcasts to a type of a different size (requires
careful tracking of the original load/store size and some math
converting sizes in bytes to the expected differences in GEP indices).

2. Reusing the partial analysis of pointers done by the first attempt at
proving them consecutive, instead of starting from scratch. This added
limited support for nested GEPs co-existing with difficult sext/zext
instructions, and required careful handling of negative differences
between the constant parts of offsets.

3. Handling the case where the first pointer index is not an add but
something else (a function parameter, for instance).

I observe an increased number of successful vectorizations on a large
set of shader programs. Only a few shaders are affected, but those that
are see >5% fewer loads and stores than before the patch.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D49342
llvm-svn: 337489
2018-07-19 19:42:43 +00:00

//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores. Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited. So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures. It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
// "vector load" loads directly into a series of scalar registers. In effect,
// extracting the elements of the vector is free. It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code. This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
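//
// For example (illustrative only), a pair of adjacent scalar loads such as
//
//   %x = load i32, i32* %p, align 8
//   %q = getelementptr inbounds i32, i32* %p, i64 1
//   %y = load i32, i32* %q, align 4
//
// may be rewritten by this pass into a single wide load followed by
// per-element extracts, roughly:
//
//   %vp = bitcast i32* %p to <2 x i32>*
//   %v  = load <2 x i32>, <2 x i32>* %vp, align 8
//   %x  = extractelement <2 x i32> %v, i32 0
//   %y  = extractelement <2 x i32> %v, i32 1
//
//===----------------------------------------------------------------------===//
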
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
#include <cassert>
#include <cstdlib>
#include <tuple>
#include <utility>
using namespace llvm;
#define DEBUG_TYPE "load-store-vectorizer"
STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;
namespace {
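// Collected memory instructions are grouped by the underlying object of their
// pointer operand (see collectInstructions); each group is a candidate chain.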
using InstrList = SmallVector<Instruction *, 8>;
using InstrListMap = MapVector<Value *, InstrList>;
class Vectorizer {
Function &F;
AliasAnalysis &AA;
DominatorTree &DT;
ScalarEvolution &SE;
TargetTransformInfo &TTI;
const DataLayout &DL;
IRBuilder<> Builder;
public:
Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
ScalarEvolution &SE, TargetTransformInfo &TTI)
: F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
bool run();
private:
unsigned getPointerAddressSpace(Value *I);
unsigned getAlignment(LoadInst *LI) const {
unsigned Align = LI->getAlignment();
if (Align != 0)
return Align;
return DL.getABITypeAlignment(LI->getType());
}
unsigned getAlignment(StoreInst *SI) const {
unsigned Align = SI->getAlignment();
if (Align != 0)
return Align;
return DL.getABITypeAlignment(SI->getValueOperand()->getType());
}
bool isConsecutiveAccess(Value *A, Value *B);
bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size);
bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta);
/// After vectorization, reorder the instructions that I depends on
/// (the instructions defining its operands), to ensure they dominate I.
void reorder(Instruction *I);
/// Returns the first and the last instructions in Chain.
std::pair<BasicBlock::iterator, BasicBlock::iterator>
getBoundaryInstrs(ArrayRef<Instruction *> Chain);
/// Erases the original instructions after vectorizing.
void eraseInstructions(ArrayRef<Instruction *> Chain);
/// "Legalize" the vector type that would be produced by combining \p
/// ElementSizeBits elements in \p Chain. Break into two pieces such that the
/// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
/// expected to have more than 4 elements.
std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
/// Finds the largest prefix of Chain that's vectorizable, checking for
/// intervening instructions which may affect the memory accessed by the
/// instructions within Chain.
///
/// The elements of \p Chain must be all loads or all stores and must be in
/// address order.
ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
/// Collects load and store instructions to vectorize.
std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
/// Processes the collected instructions in \p Map. The values of \p Map
/// must be all loads or all stores.
bool vectorizeChains(InstrListMap &Map);
/// Finds loads/stores to consecutive memory addresses and vectorizes them.
bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
/// Vectorizes the load instructions in Chain.
bool
vectorizeLoadChain(ArrayRef<Instruction *> Chain,
SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
/// Vectorizes the store instructions in Chain.
bool
vectorizeStoreChain(ArrayRef<Instruction *> Chain,
SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
/// Checks whether this load/store access is misaligned.
bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
unsigned Alignment);
};
class LoadStoreVectorizer : public FunctionPass {
public:
static char ID;
LoadStoreVectorizer() : FunctionPass(ID) {
initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
StringRef getPassName() const override {
return "GPU Load and Store Vectorizer";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.setPreservesCFG();
}
};
} // end anonymous namespace
char LoadStoreVectorizer::ID = 0;
INITIALIZE_PASS_BEGIN(LoadStoreVectorizer, DEBUG_TYPE,
"Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE,
"Vectorize load and store instructions", false, false)
Pass *llvm::createLoadStoreVectorizerPass() {
return new LoadStoreVectorizer();
}
// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
// vectors of Instructions.
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
SmallVector<Value *, 8> VL(IL.begin(), IL.end());
propagateMetadata(I, VL);
}
bool LoadStoreVectorizer::runOnFunction(Function &F) {
// Don't vectorize when the attribute NoImplicitFloat is used.
if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
return false;
AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
Vectorizer V(F, AA, DT, SE, TTI);
return V.run();
}
// Vectorizer Implementation
bool Vectorizer::run() {
bool Changed = false;
// Scan the blocks in the function in post order.
for (BasicBlock *BB : post_order(&F)) {
InstrListMap LoadRefs, StoreRefs;
std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
Changed |= vectorizeChains(LoadRefs);
Changed |= vectorizeChains(StoreRefs);
}
return Changed;
}
unsigned Vectorizer::getPointerAddressSpace(Value *I) {
if (LoadInst *L = dyn_cast<LoadInst>(I))
return L->getPointerAddressSpace();
if (StoreInst *S = dyn_cast<StoreInst>(I))
return S->getPointerAddressSpace();
return -1;
}
// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
Value *PtrA = getLoadStorePointerOperand(A);
Value *PtrB = getLoadStorePointerOperand(B);
unsigned ASA = getPointerAddressSpace(A);
unsigned ASB = getPointerAddressSpace(B);
// Check that the address spaces match and that the pointers are valid.
if (!PtrA || !PtrB || (ASA != ASB))
return false;
// Make sure that A and B are different pointers to types of the same size.
Type *PtrATy = PtrA->getType()->getPointerElementType();
Type *PtrBTy = PtrB->getType()->getPointerElementType();
if (PtrA == PtrB ||
PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
DL.getTypeStoreSize(PtrATy->getScalarType()) !=
DL.getTypeStoreSize(PtrBTy->getScalarType()))
return false;
unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
return areConsecutivePointers(PtrA, PtrB, Size);
}
bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB, APInt Size) {
unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
APInt OffsetA(PtrBitWidth, 0);
APInt OffsetB(PtrBitWidth, 0);
PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
APInt OffsetDelta = OffsetB - OffsetA;
// Check if they are based on the same pointer. That makes the offsets
// sufficient.
if (PtrA == PtrB)
return OffsetDelta == Size;
// Compute the base pointer delta needed for the final delta to equal the
// size.
APInt BaseDelta = Size - OffsetDelta;
// Compute the distance with SCEV between the base pointers.
const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
const SCEV *C = SE.getConstant(BaseDelta);
const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
if (X == PtrSCEVB)
return true;
// The above check will not catch the cases where one of the pointers is
// factorized but the other one is not, such as (C + (S * (A + B))) vs
// (AS + BS). Get the minus SCEV; that allows re-combining the expressions
// and getting the simplified difference.
const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
if (C == Dist)
return true;
// Sometimes even this doesn't work, because SCEV can't always see through
// patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
// things the hard way.
return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta);
}
bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
APInt PtrDelta) {
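// Strategy: match PtrA and PtrB as GEPs that are identical except for their
// last index, then try to prove that those last indices differ by exactly
// PtrDelta / Stride and that adding this difference cannot overflow.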
auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
if (!GEPA || !GEPB)
return false;
// Look through GEPs after checking they're the same except for the last
// index.
if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
GEPA->getPointerOperand() != GEPB->getPointerOperand())
return false;
gep_type_iterator GTIA = gep_type_begin(GEPA);
gep_type_iterator GTIB = gep_type_begin(GEPB);
for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
if (GTIA.getOperand() != GTIB.getOperand())
return false;
++GTIA;
++GTIB;
}
Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
OpA->getType() != OpB->getType())
return false;
if (PtrDelta.isNegative()) {
if (PtrDelta.isMinSignedValue())
return false;
PtrDelta.negate();
std::swap(OpA, OpB);
}
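// The byte distance between the pointers must be a whole number of strides of
// the type indexed by the differing GEP operand.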
uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
if (PtrDelta.urem(Stride) != 0)
return false;
unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
// Only look through a ZExt/SExt.
if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
return false;
bool Signed = isa<SExtInst>(OpA);
// At this point A could be a function parameter, i.e. not an instruction
Value *ValA = OpA->getOperand(0);
OpB = dyn_cast<Instruction>(OpB->getOperand(0));
if (!OpB || ValA->getType() != OpB->getType())
return false;
// Now we need to prove that adding IdxDiff to ValA won't overflow.
bool Safe = false;
// First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
// ValA, we're okay.
if (OpB->getOpcode() == Instruction::Add &&
isa<ConstantInt>(OpB->getOperand(1)) &&
IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
if (Signed)
Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
else
Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
}
unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
// Second attempt:
// If all the bits set in IdxDiff, and every higher-order bit other than the
// sign bit, are known to be zero in ValA, we can add IdxDiff to it while
// guaranteeing no overflow of any sort.
if (!Safe) {
OpA = dyn_cast<Instruction>(ValA);
if (!OpA)
return false;
KnownBits Known(BitWidth);
computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
if (Known.Zero.trunc(BitWidth - 1).zext(IdxBitWidth).ult(IdxDiff))
return false;
}
const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
return X == OffsetSCEVB;
}
void Vectorizer::reorder(Instruction *I) {
OrderedBasicBlock OBB(I->getParent());
SmallPtrSet<Instruction *, 16> InstructionsToMove;
SmallVector<Instruction *, 16> Worklist;
Worklist.push_back(I);
while (!Worklist.empty()) {
Instruction *IW = Worklist.pop_back_val();
int NumOperands = IW->getNumOperands();
for (int i = 0; i < NumOperands; i++) {
Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
if (!IM || IM->getOpcode() == Instruction::PHI)
continue;
// If IM is in another BB, no need to move it, because this pass only
// vectorizes instructions within one BB.
if (IM->getParent() != I->getParent())
continue;
if (!OBB.dominates(IM, I)) {
InstructionsToMove.insert(IM);
Worklist.push_back(IM);
}
}
}
// All instructions to move should follow I. Start from I, not from begin().
for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
++BBI) {
if (!InstructionsToMove.count(&*BBI))
continue;
Instruction *IM = &*BBI;
--BBI;
IM->removeFromParent();
IM->insertBefore(I);
}
}
std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
Instruction *C0 = Chain[0];
BasicBlock::iterator FirstInstr = C0->getIterator();
BasicBlock::iterator LastInstr = C0->getIterator();
BasicBlock *BB = C0->getParent();
unsigned NumFound = 0;
for (Instruction &I : *BB) {
if (!is_contained(Chain, &I))
continue;
++NumFound;
if (NumFound == 1) {
FirstInstr = I.getIterator();
}
if (NumFound == Chain.size()) {
LastInstr = I.getIterator();
break;
}
}
// Range is [first, last).
return std::make_pair(FirstInstr, ++LastInstr);
}
void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
SmallVector<Instruction *, 16> Instrs;
for (Instruction *I : Chain) {
Value *PtrOperand = getLoadStorePointerOperand(I);
assert(PtrOperand && "Instruction must have a pointer operand.");
Instrs.push_back(I);
if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
Instrs.push_back(GEP);
}
// Erase instructions.
for (Instruction *I : Instrs)
if (I->use_empty())
I->eraseFromParent();
}
std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
unsigned ElementSizeBits) {
unsigned ElementSizeBytes = ElementSizeBits / 8;
unsigned SizeBytes = ElementSizeBytes * Chain.size();
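// NumLeft is the number of whole elements in the longest prefix whose total
// size in bytes is a multiple of 4.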
unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
if (NumLeft == Chain.size()) {
if ((NumLeft & 1) == 0)
NumLeft /= 2; // Split even in half
else
--NumLeft; // Split off last element
} else if (NumLeft == 0)
NumLeft = 1;
return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}
ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
// These are in BB order, unlike Chain, which is in address order.
SmallVector<Instruction *, 16> MemoryInstrs;
SmallVector<Instruction *, 16> ChainInstrs;
bool IsLoadChain = isa<LoadInst>(Chain[0]);
LLVM_DEBUG({
for (Instruction *I : Chain) {
if (IsLoadChain)
assert(isa<LoadInst>(I) &&
"All elements of Chain must be loads, or all must be stores.");
else
assert(isa<StoreInst>(I) &&
"All elements of Chain must be loads, or all must be stores.");
}
});
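// Walk the range between the first and last chain instruction (in BB order).
// Chain members go to ChainInstrs and other loads/stores go to MemoryInstrs;
// stop at the first instruction that may write (for load chains) or may read
// or write (for store chains) memory, or that may throw.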
for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
if (!is_contained(Chain, &I))
MemoryInstrs.push_back(&I);
else
ChainInstrs.push_back(&I);
} else if (isa<IntrinsicInst>(&I) &&
cast<IntrinsicInst>(&I)->getIntrinsicID() ==
Intrinsic::sideeffect) {
// Ignore llvm.sideeffect calls.
} else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
<< '\n');
break;
} else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
<< '\n');
break;
}
}
OrderedBasicBlock OBB(Chain[0]->getParent());
// Loop until we find an instruction in ChainInstrs that we can't vectorize.
unsigned ChainInstrIdx = 0;
Instruction *BarrierMemoryInstr = nullptr;
for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
// If a barrier memory instruction was found, chain instructions that follow
// will not be added to the valid prefix.
if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
break;
// Check (in BB order) if any instruction prevents ChainInstr from being
// vectorized. Find and store the first such "conflicting" instruction.
for (Instruction *MemInstr : MemoryInstrs) {
// If a barrier memory instruction was found, do not check past it.
if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
break;
auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
if (MemLoad && ChainLoad)
continue;
// We can ignore the alias if we have a load/store pair and the load
// is known to be invariant. The load cannot be clobbered by the store.
auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
return LI->getMetadata(LLVMContext::MD_invariant_load);
};
// We can ignore the alias as long as the load comes before the store,
// because that means we won't be moving the load past the store to
// vectorize it (the vectorized load is inserted at the location of the
// first load in the chain).
if (isa<StoreInst>(MemInstr) && ChainLoad &&
(IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
continue;
// Same case, but in reverse.
if (MemLoad && isa<StoreInst>(ChainInstr) &&
(IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
continue;
if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
MemoryLocation::get(ChainInstr))) {
LLVM_DEBUG({
dbgs() << "LSV: Found alias:\n"
" Aliasing instruction and pointer:\n"
<< " " << *MemInstr << '\n'
<< " " << *getLoadStorePointerOperand(MemInstr) << '\n'
<< " Aliased instruction and pointer:\n"
<< " " << *ChainInstr << '\n'
<< " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
});
// Save this aliasing memory instruction as a barrier, but allow other
// instructions that precede the barrier to be vectorized with this one.
BarrierMemoryInstr = MemInstr;
break;
}
}
// Continue the search only for store chains, since vectorizing stores that
// precede an aliasing load is valid. Conversely, vectorizing loads is valid
// up to an aliasing store, but should not pull loads from further down in
// the basic block.
if (IsLoadChain && BarrierMemoryInstr) {
// The BarrierMemoryInstr is a store that precedes ChainInstr.
assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
break;
}
}
// Find the largest prefix of Chain whose elements are all in
// ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
// Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
// order.)
SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
unsigned ChainIdx = 0;
for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
break;
}
return Chain.slice(0, ChainIdx);
}
std::pair<InstrListMap, InstrListMap>
Vectorizer::collectInstructions(BasicBlock *BB) {
InstrListMap LoadRefs;
InstrListMap StoreRefs;
for (Instruction &I : *BB) {
if (!I.mayReadOrWriteMemory())
continue;
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
if (!LI->isSimple())
continue;
// Skip if it's not legal.
if (!TTI.isLegalToVectorizeLoad(LI))
continue;
Type *Ty = LI->getType();
if (!VectorType::isValidElementType(Ty->getScalarType()))
continue;
// Skip weird non-byte sizes. They probably aren't worth the effort of
// handling correctly.
unsigned TySize = DL.getTypeSizeInBits(Ty);
if ((TySize % 8) != 0)
continue;
// Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
// functions currently use an integer type for the vectorized load/store,
// and do not support casting between that integer type and a
// vector of pointers (e.g. i64 to <2 x i16*>).
if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
continue;
Value *Ptr = LI->getPointerOperand();
unsigned AS = Ptr->getType()->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / TySize;
VectorType *VecTy = dyn_cast<VectorType>(Ty);
// No point in looking at these if they're too big to vectorize.
if (TySize > VecRegSize / 2 ||
(VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
// Make sure all the users of a vector are constant-index extracts.
if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
return EEI && isa<ConstantInt>(EEI->getOperand(1));
}))
continue;
// Save the load locations.
Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
LoadRefs[ObjPtr].push_back(LI);
} else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
if (!SI->isSimple())
continue;
// Skip if it's not legal.
if (!TTI.isLegalToVectorizeStore(SI))
continue;
Type *Ty = SI->getValueOperand()->getType();
if (!VectorType::isValidElementType(Ty->getScalarType()))
continue;
// Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
// functions currently use an integer type for the vectorized load/store,
// and do not support casting between that integer type and a
// vector of pointers (e.g. i64 to <2 x i16*>).
if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
continue;
// Skip weird non-byte sizes. They probably aren't worth the effort of
// handling correctly.
unsigned TySize = DL.getTypeSizeInBits(Ty);
if ((TySize % 8) != 0)
continue;
Value *Ptr = SI->getPointerOperand();
unsigned AS = Ptr->getType()->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / TySize;
VectorType *VecTy = dyn_cast<VectorType>(Ty);
// No point in looking at these if they're too big to vectorize.
if (TySize > VecRegSize / 2 ||
(VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
continue;
if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
return EEI && isa<ConstantInt>(EEI->getOperand(1));
}))
continue;
// Save store location.
Value *ObjPtr = GetUnderlyingObject(Ptr, DL);
StoreRefs[ObjPtr].push_back(SI);
}
}
return {LoadRefs, StoreRefs};
}
bool Vectorizer::vectorizeChains(InstrListMap &Map) {
bool Changed = false;
for (const std::pair<Value *, InstrList> &Chain : Map) {
unsigned Size = Chain.second.size();
if (Size < 2)
continue;
LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
// Process the loads/stores in chunks of 64.
for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
unsigned Len = std::min<unsigned>(CE - CI, 64);
ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
Changed |= vectorizeInstructions(Chunk);
}
}
return Changed;
}
bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
<< " instructions.\n");
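// ConsecutiveChain[i] == j means that the access Instrs[j] immediately
// follows Instrs[i] in memory; Heads and Tails record the i and j of every
// such pair so that chain heads can be identified below.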
SmallVector<int, 16> Heads, Tails;
int ConsecutiveChain[64];
// Do a quadratic search on all of the given loads/stores and find all of the
// pairs of loads/stores that follow each other.
for (int i = 0, e = Instrs.size(); i < e; ++i) {
ConsecutiveChain[i] = -1;
for (int j = e - 1; j >= 0; --j) {
if (i == j)
continue;
if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
if (ConsecutiveChain[i] != -1) {
int CurDistance = std::abs(ConsecutiveChain[i] - i);
int NewDistance = std::abs(ConsecutiveChain[i] - j);
if (j < i || NewDistance > CurDistance)
continue; // Should not insert.
}
Tails.push_back(j);
Heads.push_back(i);
ConsecutiveChain[i] = j;
}
}
}
bool Changed = false;
SmallPtrSet<Instruction *, 16> InstructionsProcessed;
for (int Head : Heads) {
if (InstructionsProcessed.count(Instrs[Head]))
continue;
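// Skip this head if it is also the tail of a pair whose head has not been
// processed yet; the longer chain starting there will cover it.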
bool LongerChainExists = false;
for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
if (Head == Tails[TIt] &&
!InstructionsProcessed.count(Instrs[Heads[TIt]])) {
LongerChainExists = true;
break;
}
if (LongerChainExists)
continue;
// We found an instr that starts a chain. Now follow the chain and try to
// vectorize it.
SmallVector<Instruction *, 16> Operands;
int I = Head;
while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
if (InstructionsProcessed.count(Instrs[I]))
break;
Operands.push_back(Instrs[I]);
I = ConsecutiveChain[I];
}
bool Vectorized = false;
if (isa<LoadInst>(*Operands.begin()))
Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
else
Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
Changed |= Vectorized;
}
return Changed;
}
bool Vectorizer::vectorizeStoreChain(
ArrayRef<Instruction *> Chain,
SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
StoreInst *S0 = cast<StoreInst>(Chain[0]);
// If the vector has an int element, default to int for the whole store.
Type *StoreTy;
for (Instruction *I : Chain) {
StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
if (StoreTy->isIntOrIntVectorTy())
break;
if (StoreTy->isPtrOrPtrVectorTy()) {
StoreTy = Type::getIntNTy(F.getParent()->getContext(),
DL.getTypeSizeInBits(StoreTy));
break;
}
}
unsigned Sz = DL.getTypeSizeInBits(StoreTy);
unsigned AS = S0->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / Sz;
unsigned ChainSize = Chain.size();
unsigned Alignment = getAlignment(S0);
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;
}
ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
if (NewChain.empty()) {
// No vectorization possible.
InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;
}
if (NewChain.size() == 1) {
// Failed after the first instruction. Discard it and try the smaller chain.
InstructionsProcessed->insert(NewChain.front());
return false;
}
// Update Chain to the valid vectorizable subchain.
Chain = NewChain;
ChainSize = Chain.size();
// Check if it's legal to vectorize this chain. If not, split the chain and
// try again.
unsigned EltSzInBytes = Sz / 8;
unsigned SzInBytes = EltSzInBytes * ChainSize;
if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
auto Chains = splitOddVectorElts(Chain, Sz);
return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
vectorizeStoreChain(Chains.second, InstructionsProcessed);
}
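// Build the vector type for the combined store: if the stored values are
// themselves vectors, the result is a single flat vector of all their
// elements.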
VectorType *VecTy;
VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
if (VecStoreTy)
VecTy = VectorType::get(StoreTy->getScalarType(),
Chain.size() * VecStoreTy->getNumElements());
else
VecTy = VectorType::get(StoreTy, Chain.size());
// If it's more than the max vector size or the target has a better
// vector factor, break it into two pieces.
unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
" Creating two separate arrays.\n");
return vectorizeStoreChain(Chain.slice(0, TargetVF),
InstructionsProcessed) |
vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
}
LLVM_DEBUG({
dbgs() << "LSV: Stores to vectorize:\n";
for (Instruction *I : Chain)
dbgs() << " " << *I << "\n";
});
// We won't try again to vectorize the elements of the chain, regardless of
// whether we succeed below.
InstructionsProcessed->insert(Chain.begin(), Chain.end());
// If the store is going to be misaligned, don't vectorize it.
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
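// Misaligned accesses in address space 0 may still be vectorizable if the
// alignment of the underlying object can be proven or raised to at least
// StackAdjustedAlignment.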
if (S0->getPointerAddressSpace() != 0)
return false;
unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
StackAdjustedAlignment,
DL, S0, nullptr, &DT);
if (NewAlign < StackAdjustedAlignment)
return false;
}
BasicBlock::iterator First, Last;
std::tie(First, Last) = getBoundaryInstrs(Chain);
Builder.SetInsertPoint(&*Last);
Value *Vec = UndefValue::get(VecTy);
if (VecStoreTy) {
unsigned VecWidth = VecStoreTy->getNumElements();
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
StoreInst *Store = cast<StoreInst>(Chain[I]);
for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
unsigned NewIdx = J + I * VecWidth;
Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
Builder.getInt32(J));
if (Extract->getType() != StoreTy->getScalarType())
Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
Value *Insert =
Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
Vec = Insert;
}
}
} else {
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
StoreInst *Store = cast<StoreInst>(Chain[I]);
Value *Extract = Store->getValueOperand();
if (Extract->getType() != StoreTy->getScalarType())
Extract =
Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
Value *Insert =
Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
Vec = Insert;
}
}
// This cast is safe because Builder.CreateStore() always creates a bona fide
// StoreInst.
StoreInst *SI = cast<StoreInst>(
Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(),
VecTy->getPointerTo(AS))));
propagateMetadata(SI, Chain);
SI->setAlignment(Alignment);
eraseInstructions(Chain);
++NumVectorInstructions;
NumScalarsVectorized += Chain.size();
return true;
}
bool Vectorizer::vectorizeLoadChain(
ArrayRef<Instruction *> Chain,
SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
LoadInst *L0 = cast<LoadInst>(Chain[0]);
// If the vector has an int element, default to int for the whole load.
Type *LoadTy;
for (const auto &V : Chain) {
LoadTy = cast<LoadInst>(V)->getType();
if (LoadTy->isIntOrIntVectorTy())
break;
if (LoadTy->isPtrOrPtrVectorTy()) {
LoadTy = Type::getIntNTy(F.getParent()->getContext(),
DL.getTypeSizeInBits(LoadTy));
break;
}
}
unsigned Sz = DL.getTypeSizeInBits(LoadTy);
unsigned AS = L0->getPointerAddressSpace();
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / Sz;
unsigned ChainSize = Chain.size();
unsigned Alignment = getAlignment(L0);
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;
}
ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
if (NewChain.empty()) {
// No vectorization possible.
InstructionsProcessed->insert(Chain.begin(), Chain.end());
return false;
}
if (NewChain.size() == 1) {
// Failed after the first instruction. Discard it and try the smaller chain.
InstructionsProcessed->insert(NewChain.front());
return false;
}
// Update Chain to the valid vectorizable subchain.
Chain = NewChain;
ChainSize = Chain.size();
// Check if it's legal to vectorize this chain. If not, split the chain and
// try again.
unsigned EltSzInBytes = Sz / 8;
unsigned SzInBytes = EltSzInBytes * ChainSize;
if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
auto Chains = splitOddVectorElts(Chain, Sz);
return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
vectorizeLoadChain(Chains.second, InstructionsProcessed);
}
VectorType *VecTy;
VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
if (VecLoadTy)
VecTy = VectorType::get(LoadTy->getScalarType(),
Chain.size() * VecLoadTy->getNumElements());
else
VecTy = VectorType::get(LoadTy, Chain.size());
// If it's more than the max vector size or the target has a better
// vector factor, break it into two pieces.
unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
" Creating two separate arrays.\n");
return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
}
// We won't try again to vectorize the elements of the chain, regardless of
// whether we succeed below.
InstructionsProcessed->insert(Chain.begin(), Chain.end());
// If the load is going to be misaligned, don't vectorize it.
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
if (L0->getPointerAddressSpace() != 0)
return false;
unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
StackAdjustedAlignment,
DL, L0, nullptr, &DT);
if (NewAlign < StackAdjustedAlignment)
return false;
Alignment = NewAlign;
}
LLVM_DEBUG({
dbgs() << "LSV: Loads to vectorize:\n";
for (Instruction *I : Chain)
I->dump();
});
// getVectorizablePrefix already computed getBoundaryInstrs. The value of
// Last may have changed since then, but the value of First won't have. If it
// matters, we could compute getBoundaryInstrs only once and reuse it here.
BasicBlock::iterator First, Last;
std::tie(First, Last) = getBoundaryInstrs(Chain);
Builder.SetInsertPoint(&*First);
Value *Bitcast =
Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
// This cast is safe because Builder.CreateLoad always creates a bona fide
// LoadInst.
LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
propagateMetadata(LI, Chain);
LI->setAlignment(Alignment);
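// Replace each original load with extractelement(s) from the wide load. For
// vector-typed loads, rewrite their extractelement users directly against the
// new load; otherwise extract one element per original scalar load.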
if (VecLoadTy) {
SmallVector<Instruction *, 16> InstrsToErase;
unsigned VecWidth = VecLoadTy->getNumElements();
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
for (auto Use : Chain[I]->users()) {
// All users of vector loads are ExtractElement instructions with
// constant indices, otherwise we would have bailed before now.
Instruction *UI = cast<Instruction>(Use);
unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
unsigned NewIdx = Idx + I * VecWidth;
Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
UI->getName());
if (V->getType() != UI->getType())
V = Builder.CreateBitCast(V, UI->getType());
// Replace the old instruction.
UI->replaceAllUsesWith(V);
InstrsToErase.push_back(UI);
}
}
// Bitcast might not be an Instruction, if the pointer operand is a
// constant. In that case, no need to reorder anything.
if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
reorder(BitcastInst);
for (auto I : InstrsToErase)
I->eraseFromParent();
} else {
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
Value *CV = Chain[I];
Value *V =
Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
if (V->getType() != CV->getType()) {
V = Builder.CreateBitOrPointerCast(V, CV->getType());
}
// Replace the old instruction.
CV->replaceAllUsesWith(V);
}
if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
reorder(BitcastInst);
}
eraseInstructions(Chain);
++NumVectorInstructions;
NumScalarsVectorized += Chain.size();
return true;
}
bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
unsigned Alignment) {
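// An access whose alignment is a multiple of its size is sufficiently aligned.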
if (Alignment % SzInBytes == 0)
return false;
bool Fast = false;
bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
SzInBytes * 8, AddressSpace,
Alignment, &Fast);
LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
<< " and fast? " << Fast << "\n";);
return !Allows || !Fast;
}