//===----------- VectorUtils.cpp - Vectorizer utility functions -----------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines vectorizer utilities.
//
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
#define DEBUG_TYPE "vectorutils"
using namespace llvm;
using namespace llvm::PatternMatch;
/// Maximum factor for an interleaved memory access.
static cl::opt<unsigned> MaxInterleaveGroupFactor(
"max-interleave-group-factor", cl::Hidden,
cl::desc("Maximum factor for an interleaved access group (default = 8)"),
cl::init(8));
/// Return true if all of the intrinsic's arguments and return type are scalars
/// for the scalar form of the intrinsic and vectors for the vector form of the
/// intrinsic.
bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
switch (ID) {
case Intrinsic::bswap: // Begin integer bit-manipulation.
case Intrinsic::bitreverse:
case Intrinsic::ctpop:
case Intrinsic::ctlz:
case Intrinsic::cttz:
case Intrinsic::fshl:
case Intrinsic::fshr:
case Intrinsic::sqrt: // Begin floating-point.
case Intrinsic::sin:
case Intrinsic::cos:
case Intrinsic::exp:
case Intrinsic::exp2:
case Intrinsic::log:
case Intrinsic::log10:
case Intrinsic::log2:
case Intrinsic::fabs:
case Intrinsic::minnum:
case Intrinsic::maxnum:
case Intrinsic::minimum:
case Intrinsic::maximum:
case Intrinsic::copysign:
case Intrinsic::floor:
case Intrinsic::ceil:
case Intrinsic::trunc:
case Intrinsic::rint:
case Intrinsic::nearbyint:
case Intrinsic::round:
case Intrinsic::pow:
case Intrinsic::fma:
case Intrinsic::fmuladd:
case Intrinsic::powi:
case Intrinsic::canonicalize:
return true;
default:
return false;
}
}
/// Identifies if the intrinsic has a scalar operand. It checks for the
/// ctlz, cttz and powi intrinsics, whose argument is scalar.
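///
/// For example, in the vector forms of ctlz/cttz the i1 'is_zero_undef'
/// flag (operand 1) remains scalar, and in powi the i32 exponent
/// (operand 1) remains scalar.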
bool llvm::hasVectorInstrinsicScalarOpd(Intrinsic::ID ID,
unsigned ScalarOpdIdx) {
switch (ID) {
case Intrinsic::ctlz:
case Intrinsic::cttz:
case Intrinsic::powi:
return (ScalarOpdIdx == 1);
default:
return false;
}
}
/// Returns the intrinsic ID for the call.
/// For the given call instruction it finds the corresponding intrinsic and
/// returns its ID; if none is found, it returns not_intrinsic.
Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
const TargetLibraryInfo *TLI) {
Intrinsic::ID ID = getIntrinsicForCallSite(CI, TLI);
if (ID == Intrinsic::not_intrinsic)
return Intrinsic::not_intrinsic;
if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start ||
ID == Intrinsic::lifetime_end || ID == Intrinsic::assume ||
ID == Intrinsic::sideeffect)
return ID;
return Intrinsic::not_intrinsic;
}
/// Find the operand of the GEP that should be checked for consecutive
/// stores. This ignores trailing indices that have no effect on the final
/// pointer.
unsigned llvm::getGEPInductionOperand(const GetElementPtrInst *Gep) {
const DataLayout &DL = Gep->getModule()->getDataLayout();
unsigned LastOperand = Gep->getNumOperands() - 1;
unsigned GEPAllocSize = DL.getTypeAllocSize(Gep->getResultElementType());
// Walk backwards and try to peel off zeros.
while (LastOperand > 1 && match(Gep->getOperand(LastOperand), m_Zero())) {
// Find the type we're currently indexing into.
gep_type_iterator GEPTI = gep_type_begin(Gep);
std::advance(GEPTI, LastOperand - 2);
// If it's a type with the same allocation size as the result of the GEP we
// can peel off the zero index.
if (DL.getTypeAllocSize(GEPTI.getIndexedType()) != GEPAllocSize)
break;
--LastOperand;
}
return LastOperand;
}
/// If the argument is a GEP, then returns the operand identified by
/// getGEPInductionOperand. However, if there is some other non-loop-invariant
/// operand, it returns that instead.
Value *llvm::stripGetElementPtr(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
if (!GEP)
return Ptr;
unsigned InductionOperand = getGEPInductionOperand(GEP);
// Check that all of the gep indices are uniform except for our induction
// operand.
for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
if (i != InductionOperand &&
!SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
return Ptr;
return GEP->getOperand(InductionOperand);
}
/// If a value has only one user that is a CastInst, return it.
Value *llvm::getUniqueCastUse(Value *Ptr, Loop *Lp, Type *Ty) {
Value *UniqueCast = nullptr;
for (User *U : Ptr->users()) {
CastInst *CI = dyn_cast<CastInst>(U);
if (CI && CI->getType() == Ty) {
if (!UniqueCast)
UniqueCast = CI;
else
return nullptr;
}
}
return UniqueCast;
}
/// Get the stride of a pointer access in a loop. Looks for symbolic
/// strides "a[i*stride]". Returns the symbolic stride, or null otherwise.
Value *llvm::getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
auto *PtrTy = dyn_cast<PointerType>(Ptr->getType());
if (!PtrTy || PtrTy->isAggregateType())
return nullptr;
// Try to strip a GEP instruction to make the pointer (actually the index at
// this point) easier to analyze. If OrigPtr is equal to Ptr we are analyzing
// the pointer, otherwise we are analyzing the index.
Value *OrigPtr = Ptr;
// The size of the pointer access.
int64_t PtrAccessSize = 1;
Ptr = stripGetElementPtr(Ptr, SE, Lp);
const SCEV *V = SE->getSCEV(Ptr);
if (Ptr != OrigPtr)
// Strip off casts.
while (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V))
V = C->getOperand();
const SCEVAddRecExpr *S = dyn_cast<SCEVAddRecExpr>(V);
if (!S)
return nullptr;
V = S->getStepRecurrence(*SE);
if (!V)
return nullptr;
// Strip off the size of access multiplication if we are still analyzing the
// pointer.
if (OrigPtr == Ptr) {
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(V)) {
if (M->getOperand(0)->getSCEVType() != scConstant)
return nullptr;
const APInt &APStepVal = cast<SCEVConstant>(M->getOperand(0))->getAPInt();
// Huge step value - give up.
if (APStepVal.getBitWidth() > 64)
return nullptr;
int64_t StepVal = APStepVal.getSExtValue();
if (PtrAccessSize != StepVal)
return nullptr;
V = M->getOperand(1);
}
}
// Strip off casts.
Type *StrippedOffRecurrenceCast = nullptr;
if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(V)) {
StrippedOffRecurrenceCast = C->getType();
V = C->getOperand();
}
// Look for the loop invariant symbolic value.
const SCEVUnknown *U = dyn_cast<SCEVUnknown>(V);
if (!U)
return nullptr;
Value *Stride = U->getValue();
if (!Lp->isLoopInvariant(Stride))
return nullptr;
// If we have stripped off the recurrence cast we have to make sure that we
// return the value that is used in this loop so that we can replace it later.
if (StrippedOffRecurrenceCast)
Stride = getUniqueCastUse(Stride, Lp, StrippedOffRecurrenceCast);
return Stride;
}
/// Given a vector and an element number, see if the scalar value is
/// already around as a register, for example if it were inserted then extracted
/// from the vector.
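///
/// For example, given
///   %v = insertelement <4 x i32> %w, i32 %x, i32 2
/// findScalarElement(%v, 2) returns %x, and findScalarElement(%v, 0)
/// recurses into %w.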
Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
assert(V->getType()->isVectorTy() && "Not looking at a vector?");
VectorType *VTy = cast<VectorType>(V->getType());
unsigned Width = VTy->getNumElements();
if (EltNo >= Width) // Out of range access.
return UndefValue::get(VTy->getElementType());
if (Constant *C = dyn_cast<Constant>(V))
return C->getAggregateElement(EltNo);
if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
// If this is an insert to a variable element, we don't know what it is.
if (!isa<ConstantInt>(III->getOperand(2)))
return nullptr;
unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();
// If this is an insert to the element we are looking for, return the
// inserted value.
if (EltNo == IIElt)
return III->getOperand(1);
// Otherwise, the insertelement doesn't modify the value, recurse on its
// vector input.
return findScalarElement(III->getOperand(0), EltNo);
}
if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
unsigned LHSWidth = SVI->getOperand(0)->getType()->getVectorNumElements();
int InEl = SVI->getMaskValue(EltNo);
if (InEl < 0)
return UndefValue::get(VTy->getElementType());
if (InEl < (int)LHSWidth)
return findScalarElement(SVI->getOperand(0), InEl);
return findScalarElement(SVI->getOperand(1), InEl - LHSWidth);
}
// Extract a value from a vector add operation with a constant zero.
// TODO: Use getBinOpIdentity() to generalize this.
Value *Val; Constant *C;
if (match(V, m_Add(m_Value(Val), m_Constant(C))))
if (Constant *Elt = C->getAggregateElement(EltNo))
if (Elt->isNullValue())
return findScalarElement(Val, EltNo);
// Otherwise, we don't know.
return nullptr;
}
/// Get the splat value if the input is a splat vector, or return nullptr.
/// This function is not fully general. It checks only 2 cases:
/// the input value is (1) a splat constant vector or (2) a sequence
/// of instructions that broadcasts a single value into a vector.
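///
/// For example, the canonical broadcast sequence
///   %ins   = insertelement <4 x float> undef, float %x, i32 0
///   %splat = shufflevector <4 x float> %ins, <4 x float> undef,
///                          <4 x i32> zeroinitializer
/// returns %x.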
const llvm::Value *llvm::getSplatValue(const Value *V) {
if (auto *C = dyn_cast<Constant>(V))
if (isa<VectorType>(V->getType()))
return C->getSplatValue();
auto *ShuffleInst = dyn_cast<ShuffleVectorInst>(V);
if (!ShuffleInst)
return nullptr;
// All-zero (or undef) shuffle mask elements.
for (int MaskElt : ShuffleInst->getShuffleMask())
if (MaskElt != 0 && MaskElt != -1)
return nullptr;
// The first shuffle source is 'insertelement' with index 0.
auto *InsertEltInst =
dyn_cast<InsertElementInst>(ShuffleInst->getOperand(0));
if (!InsertEltInst || !isa<ConstantInt>(InsertEltInst->getOperand(2)) ||
!cast<ConstantInt>(InsertEltInst->getOperand(2))->isZero())
return nullptr;
return InsertEltInst->getOperand(1);
}
// The algorithm below shrinks integer operations into the smallest type able
// to hold the demanded bits. Starting from roots (truncs and icmps), it walks
// use-def chains until they terminate, leave the given blocks, or reach an
// unsafe instruction such as an inttoptr cast. Chains that meet are unioned
// into a single equivalence class, and the demanded bits of all members of a
// class are ORed together; the minimum bitwidth for the class is its bitwidth
// minus the number of leading zeros in that mask. Because the classes are
// unioned, no extra casts ever need to be inserted between subgraphs with
// different minimum bitwidths ("first, do no harm").
MapVector<Instruction *, uint64_t>
llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
const TargetTransformInfo *TTI) {
// DemandedBits will give us every value's live-out bits. But we want
// to ensure no extra casts would need to be inserted, so every DAG
// of connected values must have the same minimum bitwidth.
EquivalenceClasses<Value *> ECs;
SmallVector<Value *, 16> Worklist;
SmallPtrSet<Value *, 4> Roots;
SmallPtrSet<Value *, 16> Visited;
DenseMap<Value *, uint64_t> DBits;
SmallPtrSet<Instruction *, 4> InstructionSet;
MapVector<Instruction *, uint64_t> MinBWs;
// Determine the roots. We work bottom-up, from truncs or icmps.
bool SeenExtFromIllegalType = false;
for (auto *BB : Blocks)
for (auto &I : *BB) {
InstructionSet.insert(&I);
if (TTI && (isa<ZExtInst>(&I) || isa<SExtInst>(&I)) &&
!TTI->isTypeLegal(I.getOperand(0)->getType()))
SeenExtFromIllegalType = true;
// Only deal with non-vector integers up to 64-bits wide.
if ((isa<TruncInst>(&I) || isa<ICmpInst>(&I)) &&
!I.getType()->isVectorTy() &&
I.getOperand(0)->getType()->getScalarSizeInBits() <= 64) {
// Don't make work for ourselves. If we know the loaded type is legal,
// don't add it to the worklist.
if (TTI && isa<TruncInst>(&I) && TTI->isTypeLegal(I.getType()))
continue;
Worklist.push_back(&I);
Roots.insert(&I);
}
}
// Early exit.
if (Worklist.empty() || (TTI && !SeenExtFromIllegalType))
return MinBWs;
// Now proceed breadth-first, unioning values together.
while (!Worklist.empty()) {
Value *Val = Worklist.pop_back_val();
Value *Leader = ECs.getOrInsertLeaderValue(Val);
if (Visited.count(Val))
continue;
Visited.insert(Val);
// Non-instructions terminate a chain successfully.
if (!isa<Instruction>(Val))
continue;
Instruction *I = cast<Instruction>(Val);
// If we encounter a type that is larger than 64 bits, we can't represent
// it, so bail out.
if (DB.getDemandedBits(I).getBitWidth() > 64)
return MapVector<Instruction *, uint64_t>();
uint64_t V = DB.getDemandedBits(I).getZExtValue();
DBits[Leader] |= V;
DBits[I] = V;
// Casts, loads and instructions outside of our range terminate a chain
// successfully.
if (isa<SExtInst>(I) || isa<ZExtInst>(I) || isa<LoadInst>(I) ||
!InstructionSet.count(I))
continue;
// Unsafe casts terminate a chain unsuccessfully. We can't do anything
// useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to
// transform anything that relies on them.
if (isa<BitCastInst>(I) || isa<PtrToIntInst>(I) || isa<IntToPtrInst>(I) ||
!I->getType()->isIntegerTy()) {
DBits[Leader] |= ~0ULL;
continue;
}
// We don't modify the types of PHIs. Reductions will already have been
// truncated if possible, and inductions' sizes will have been chosen by
// indvars.
if (isa<PHINode>(I))
continue;
if (DBits[Leader] == ~0ULL)
// All bits demanded, no point continuing.
continue;
for (Value *O : cast<User>(I)->operands()) {
ECs.unionSets(Leader, O);
Worklist.push_back(O);
}
}
// Now we've discovered all values, walk them to see if there are
// any users we didn't see. If there are, we can't optimize that
// chain.
for (auto &I : DBits)
for (auto *U : I.first->users())
if (U->getType()->isIntegerTy() && DBits.count(U) == 0)
DBits[ECs.getOrInsertLeaderValue(I.first)] |= ~0ULL;
for (auto I = ECs.begin(), E = ECs.end(); I != E; ++I) {
uint64_t LeaderDemandedBits = 0;
for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI)
LeaderDemandedBits |= DBits[*MI];
uint64_t MinBW = (sizeof(LeaderDemandedBits) * 8) -
llvm::countLeadingZeros(LeaderDemandedBits);
// Round up to a power of 2.
if (!isPowerOf2_64(MinBW))
MinBW = NextPowerOf2(MinBW);
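// For example, a combined demanded-bits mask of 0x1FF demands 9 bits
// (64 - 55 leading zeros), which is then rounded up to a 16-bit MinBW.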
// We don't modify the types of PHIs. Reductions will already have been
// truncated if possible, and inductions' sizes will have been chosen by
// indvars.
// If we are required to shrink a PHI, abandon this entire equivalence class.
bool Abort = false;
for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI)
if (isa<PHINode>(*MI) && MinBW < (*MI)->getType()->getScalarSizeInBits()) {
Abort = true;
break;
}
if (Abort)
continue;
for (auto MI = ECs.member_begin(I), ME = ECs.member_end(); MI != ME; ++MI) {
if (!isa<Instruction>(*MI))
continue;
Type *Ty = (*MI)->getType();
if (Roots.count(*MI))
Ty = cast<Instruction>(*MI)->getOperand(0)->getType();
if (MinBW < Ty->getScalarSizeInBits())
MinBWs[cast<Instruction>(*MI)] = MinBW;
}
}
return MinBWs;
}
/// \returns \p Inst after propagating metadata from \p VL.
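/// TBAA, alias-scope and fpmath metadata are merged to their most generic
/// form, while noalias, nontemporal and invariant.load are kept only when
/// present on every instruction in \p VL.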
Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
Instruction *I0 = cast<Instruction>(VL[0]);
SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
I0->getAllMetadataOtherThanDebugLoc(Metadata);
for (auto Kind :
{LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
LLVMContext::MD_nontemporal, LLVMContext::MD_invariant_load}) {
MDNode *MD = I0->getMetadata(Kind);
for (int J = 1, E = VL.size(); MD && J != E; ++J) {
const Instruction *IJ = cast<Instruction>(VL[J]);
MDNode *IMD = IJ->getMetadata(Kind);
switch (Kind) {
case LLVMContext::MD_tbaa:
MD = MDNode::getMostGenericTBAA(MD, IMD);
break;
case LLVMContext::MD_alias_scope:
MD = MDNode::getMostGenericAliasScope(MD, IMD);
break;
case LLVMContext::MD_fpmath:
MD = MDNode::getMostGenericFPMath(MD, IMD);
break;
case LLVMContext::MD_noalias:
case LLVMContext::MD_nontemporal:
case LLVMContext::MD_invariant_load:
MD = MDNode::intersect(MD, IMD);
break;
default:
llvm_unreachable("unhandled metadata");
}
}
Inst->setMetadata(Kind, MD);
}
return Inst;
}
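// For example, for an interleave group with factor 4 where only members 0
// and 2 are present, and VF = 2, the gap mask is
// <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>.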
Constant *
llvm::createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
const InterleaveGroup<Instruction> &Group) {
// All 1's means mask is not needed.
if (Group.getNumMembers() == Group.getFactor())
return nullptr;
// TODO: support reversed access.
assert(!Group.isReverse() && "Reversed group not supported.");
SmallVector<Constant *, 16> Mask;
for (unsigned i = 0; i < VF; i++)
for (unsigned j = 0; j < Group.getFactor(); ++j) {
unsigned HasMember = Group.getMember(j) ? 1 : 0;
Mask.push_back(Builder.getInt1(HasMember));
}
return ConstantVector::get(Mask);
}
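// For example, the replicated mask for ReplicationFactor = 3 and VF = 4 is
// <0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3>.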
Constant *llvm::createReplicatedMask(IRBuilder<> &Builder,
unsigned ReplicationFactor, unsigned VF) {
SmallVector<Constant *, 16> MaskVec;
for (unsigned i = 0; i < VF; i++)
for (unsigned j = 0; j < ReplicationFactor; j++)
MaskVec.push_back(Builder.getInt32(i));
return ConstantVector::get(MaskVec);
}
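// For example, the interleave mask for VF = 4 and NumVecs = 2 is
// <0, 4, 1, 5, 2, 6, 3, 7>.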
Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
unsigned NumVecs) {
SmallVector<Constant *, 16> Mask;
for (unsigned i = 0; i < VF; i++)
for (unsigned j = 0; j < NumVecs; j++)
Mask.push_back(Builder.getInt32(j * VF + i));
return ConstantVector::get(Mask);
}
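// For example, the stride mask for Start = 0, Stride = 2 and VF = 4 is
// <0, 2, 4, 6>.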
Constant *llvm::createStrideMask(IRBuilder<> &Builder, unsigned Start,
unsigned Stride, unsigned VF) {
SmallVector<Constant *, 16> Mask;
for (unsigned i = 0; i < VF; i++)
Mask.push_back(Builder.getInt32(Start + i * Stride));
return ConstantVector::get(Mask);
}
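// For example, the sequential mask for Start = 0, NumInts = 4 and
// NumUndefs = 4 is <0, 1, 2, 3, undef, undef, undef, undef>.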
Constant *llvm::createSequentialMask(IRBuilder<> &Builder, unsigned Start,
unsigned NumInts, unsigned NumUndefs) {
SmallVector<Constant *, 16> Mask;
for (unsigned i = 0; i < NumInts; i++)
Mask.push_back(Builder.getInt32(Start + i));
Constant *Undef = UndefValue::get(Builder.getInt32Ty());
for (unsigned i = 0; i < NumUndefs; i++)
Mask.push_back(Undef);
return ConstantVector::get(Mask);
}
/// A helper function for concatenating vectors. This function concatenates two
/// vectors having the same element type. If the second vector has fewer
/// elements than the first, it is padded with undefs.
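///
/// For example, concatenating two <2 x i32> values %a and %b emits
///   shufflevector <2 x i32> %a, <2 x i32> %b,
///                 <4 x i32> <i32 0, i32 1, i32 2, i32 3>
/// and a shorter second vector is first widened with undef elements.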
static Value *concatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
Value *V2) {
VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
assert(VecTy1 && VecTy2 &&
VecTy1->getScalarType() == VecTy2->getScalarType() &&
"Expect two vectors with the same element type");
unsigned NumElts1 = VecTy1->getNumElements();
unsigned NumElts2 = VecTy2->getNumElements();
assert(NumElts1 >= NumElts2 &&
"Expected the first vector to have at least as many elements");
if (NumElts1 > NumElts2) {
// Extend with UNDEFs.
Constant *ExtMask =
createSequentialMask(Builder, 0, NumElts2, NumElts1 - NumElts2);
V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
}
Constant *Mask = createSequentialMask(Builder, 0, NumElts1 + NumElts2, 0);
return Builder.CreateShuffleVector(V1, V2, Mask);
}
Value *llvm::concatenateVectors(IRBuilder<> &Builder, ArrayRef<Value *> Vecs) {
unsigned NumVecs = Vecs.size();
assert(NumVecs > 1 && "Should be at least two vectors");
SmallVector<Value *, 8> ResList;
ResList.append(Vecs.begin(), Vecs.end());
do {
SmallVector<Value *, 8> TmpList;
for (unsigned i = 0; i < NumVecs - 1; i += 2) {
Value *V0 = ResList[i], *V1 = ResList[i + 1];
assert((V0->getType() == V1->getType() || i == NumVecs - 2) &&
"Only the last vector may have a different type");
TmpList.push_back(concatenateTwoVectors(Builder, V0, V1));
}
// Push the last vector if the total number of vectors is odd.
if (NumVecs % 2 != 0)
TmpList.push_back(ResList[NumVecs - 1]);
ResList = TmpList;
NumVecs = ResList.size();
} while (NumVecs > 1);
return ResList[0];
}
bool InterleavedAccessInfo::isStrided(int Stride) {
unsigned Factor = std::abs(Stride);
return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
}
void InterleavedAccessInfo::collectConstStrideAccesses(
MapVector<Instruction *, StrideDescriptor> &AccessStrideInfo,
const ValueToValueMap &Strides) {
auto &DL = TheLoop->getHeader()->getModule()->getDataLayout();
// Since it's desired that the load/store instructions be maintained in
// "program order" for the interleaved access analysis, we have to visit the
// blocks in the loop in reverse postorder (i.e., in a topological order).
// Such an ordering will ensure that any load/store that may be executed
// before a second load/store will precede the second load/store in
// AccessStrideInfo.
LoopBlocksDFS DFS(TheLoop);
DFS.perform(LI);
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
for (auto &I : *BB) {
auto *LI = dyn_cast<LoadInst>(&I);
auto *SI = dyn_cast<StoreInst>(&I);
if (!LI && !SI)
continue;
Value *Ptr = getLoadStorePointerOperand(&I);
// We don't check wrapping here because we don't know yet if Ptr will be
// part of a full group or a group with gaps. Checking wrapping for all
// pointers (even those that end up in groups with no gaps) will be overly
// conservative. For full groups, wrapping should be ok since if we would
// wrap around the address space we would do a memory access at nullptr
// even without the transformation. The wrapping checks are therefore
// deferred until after we've formed the interleaved groups.
int64_t Stride = getPtrStride(PSE, Ptr, TheLoop, Strides,
/*Assume=*/true, /*ShouldCheckWrap=*/false);
const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
// An alignment of 0 means target ABI alignment.
unsigned Align = getLoadStoreAlignment(&I);
if (!Align)
Align = DL.getABITypeAlignment(PtrTy->getElementType());
AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, Align);
}
}
// Analyze interleaved accesses and collect them into interleaved load and
// store groups.
//
// When generating code for an interleaved load group, we effectively hoist all
// loads in the group to the location of the first load in program order. When
// generating code for an interleaved store group, we sink all stores to the
// location of the last store. This code motion can change the order of load
// and store instructions and may break dependences.
//
// The code generation strategy mentioned above ensures that we won't violate
// any write-after-read (WAR) dependences.
//
// E.g., for the WAR dependence:  a = A[i];  // (1)
//                                A[i] = b;  // (2)
//
// The store group of (2) is always inserted at or below (2), and the load
// group of (1) is always inserted at or above (1). Thus, the instructions will
// never be reordered. All other dependences are checked to ensure the
// correctness of the instruction reordering.
//
// The algorithm visits all memory accesses in the loop in bottom-up program
// order. Program order is established by traversing the blocks in the loop in
// reverse postorder when collecting the accesses.
//
// We visit the memory accesses in bottom-up order because it can simplify the
// construction of store groups in the presence of write-after-write (WAW)
// dependences.
//
// E.g., for the WAW dependence:  A[i] = a;      // (1)
//                                A[i] = b;      // (2)
//                                A[i + 1] = c;  // (3)
//
// We will first create a store group with (3) and (2). (1) can't be added to
// this group because it and (2) are dependent. However, (1) can be grouped
// with other accesses that may precede it in program order. Note that a
// bottom-up order does not imply that WAW dependences should not be checked.
void InterleavedAccessInfo::analyzeInterleaving(
bool EnablePredicatedInterleavedMemAccesses) {
LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
const ValueToValueMap &Strides = LAI->getSymbolicStrides();
// Holds all accesses with a constant stride.
MapVector<Instruction *, StrideDescriptor> AccessStrideInfo;
collectConstStrideAccesses(AccessStrideInfo, Strides);
if (AccessStrideInfo.empty())
return;
// Collect the dependences in the loop.
collectDependences();
// Holds all interleaved store groups temporarily.
SmallSetVector<InterleaveGroup<Instruction> *, 4> StoreGroups;
// Holds all interleaved load groups temporarily.
SmallSetVector<InterleaveGroup<Instruction> *, 4> LoadGroups;
// Search in bottom-up program order for pairs of accesses (A and B) that can
// form interleaved load or store groups. In the algorithm below, access A
// precedes access B in program order. We initialize a group for B in the
// outer loop of the algorithm, and then in the inner loop, we attempt to
// insert each A into B's group if:
//
// 1. A and B have the same stride,
// 2. A and B have the same memory object size, and
// 3. A belongs in B's group according to its distance from B.
//
// Special care is taken to ensure group formation will not break any
// dependences.
for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
BI != E; ++BI) {
Instruction *B = BI->first;
StrideDescriptor DesB = BI->second;
// Initialize a group for B if it has an allowable stride. Even if we don't
// create a group for B, we continue with the bottom-up algorithm to ensure
// we don't break any of B's dependences.
InterleaveGroup<Instruction> *Group = nullptr;
if (isStrided(DesB.Stride) &&
(!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) {
Group = getInterleaveGroup(B);
if (!Group) {
LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
<< '\n');
Group = createInterleaveGroup(B, DesB.Stride, DesB.Align);
}
if (B->mayWriteToMemory())
StoreGroups.insert(Group);
else
LoadGroups.insert(Group);
}
for (auto AI = std::next(BI); AI != E; ++AI) {
Instruction *A = AI->first;
StrideDescriptor DesA = AI->second;
// Our code motion strategy implies that we can't have dependences
// between accesses in an interleaved group and other accesses located
// between the first and last member of the group. Note that this also
// means that a group can't have more than one member at a given offset.
// The accesses in a group can have dependences with other accesses, but
// we must ensure we don't extend the boundaries of the group such that
// we encompass those dependent accesses.
//
// For example, assume we have the sequence of accesses shown below in a
// stride-2 loop:
//
//     (1, 2) is a group | A[i]   = a;  // (1)
//                       | A[i-1] = b;  // (2) |
//                         A[i-3] = c;  // (3)
//                         A[i]   = d;  // (4) | (2, 4) is not a group
//
// Because accesses (2) and (3) are dependent, we can group (2) with (1)
// but not with (4). If we did, the dependent access (3) would be within
// the boundaries of the (2, 4) group.
if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI)) {
// If a dependence exists and A is already in a group, we know that A
// must be a store since A precedes B and WAR dependences are allowed.
// Thus, A would be sunk below B. We release A's group to prevent this
// illegal code motion. A will then be free to form another group with
// instructions that precede it.
if (isInterleaved(A)) {
InterleaveGroup<Instruction> *StoreGroup = getInterleaveGroup(A);
StoreGroups.remove(StoreGroup);
releaseGroup(StoreGroup);
}
// If a dependence exists and A is not already in a group (or it was
// and we just released it), B might be hoisted above A (if B is a
// load) or another store might be sunk below A (if B is a store). In
// either case, we can't add additional instructions to B's group. B
// will only form a group with instructions that it precedes.
break;
}
// At this point, we've checked for illegal code motion. If either A or B
// isn't strided, there's nothing left to do.
if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
continue;
// Ignore A if it's already in a group or isn't the same kind of memory
// operation as B.
// Note that mayReadFromMemory() isn't mutually exclusive with
// mayWriteToMemory() in the case of atomic loads. We shouldn't see those
// here; canVectorizeMemory() should have returned false - unless we were
// asked for optimization remarks.
if (isInterleaved(A) ||
(A->mayReadFromMemory() != B->mayReadFromMemory()) ||
(A->mayWriteToMemory() != B->mayWriteToMemory()))
continue;
// Check rules 1 and 2. Ignore A if its stride or size is different from
// that of B.
if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
continue;
// Ignore A if the memory objects of A and B don't belong to the same
// address space.
if (getLoadStoreAddressSpace(A) != getLoadStoreAddressSpace(B))
continue;
// Calculate the distance from A to B.
const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
if (!DistToB)
continue;
int64_t DistanceToB = DistToB->getAPInt().getSExtValue();
// Check rule 3. Ignore A if its distance to B is not a multiple of the
// size.
if (DistanceToB % static_cast<int64_t>(DesB.Size))
continue;
// All members of a predicated interleave-group must have the same predicate,
// and currently must reside in the same BB.
BasicBlock *BlockA = A->getParent();
BasicBlock *BlockB = B->getParent();
if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
(!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))
continue;
// The index of A is the index of B plus A's distance to B in multiples
// of the size.
int IndexA =
Group->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);
// Try to insert A into B's group.
if (Group->insertMember(A, IndexA, DesA.Align)) {
LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
<< " into the interleave group with" << *B
<< '\n');
InterleaveGroupMap[A] = Group;
// Set the first load in program order as the insert position.
if (A->mayReadFromMemory())
Group->setInsertPos(A);
}
} // Iteration over A accesses.
} // Iteration over B accesses.
// Remove interleaved store groups with gaps.
for (auto *Group : StoreGroups)
if (Group->getNumMembers() != Group->getFactor()) {
LLVM_DEBUG(
dbgs() << "LV: Invalidate candidate interleaved store group due "
"to gaps.\n");
releaseGroup(Group);
}
// Remove interleaved groups with gaps (currently only loads) whose memory
// accesses may wrap around. We have to revisit the getPtrStride analysis,
// this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
// not check wrapping (see documentation there).
// FORNOW we use Assume=false;
// TODO: Change to Assume=true but making sure we don't exceed the threshold
// of runtime SCEV assumptions checks (thereby potentially failing to
// vectorize altogether).
// Additional optional optimizations:
// TODO: If we are peeling the loop and we know that the first pointer doesn't
// wrap then we can deduce that all pointers in the group don't wrap.
// This means that we can forcefully peel the loop in order to only have to
// check the first pointer for no-wrap. When we'll change to use Assume=true
// we'll only need at most one runtime check per interleaved group.
for (auto *Group : LoadGroups) {
// Case 1: A full group. We can skip the checks; for full groups, if the wide
// load would wrap around the address space we would do a memory access at
// nullptr even without the transformation.
if (Group->getNumMembers() == Group->getFactor())
continue;
// Case 2: If first and last members of the group don't wrap this implies
// that all the pointers in the group don't wrap.
// So we check only group member 0 (which is always guaranteed to exist),
// and group member Factor - 1; if the latter doesn't exist we rely on
// peeling (if it is a non-reversed access -- see Case 3).
Value *FirstMemberPtr = getLoadStorePointerOperand(Group->getMember(0));
if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
/*ShouldCheckWrap=*/true)) {
LLVM_DEBUG(
dbgs() << "LV: Invalidate candidate interleaved group due to "
"first group member potentially pointer-wrapping.\n");
releaseGroup(Group);
continue;
}
Instruction *LastMember = Group->getMember(Group->getFactor() - 1);
if (LastMember) {
Value *LastMemberPtr = getLoadStorePointerOperand(LastMember);
if (!getPtrStride(PSE, LastMemberPtr, TheLoop, Strides, /*Assume=*/false,
/*ShouldCheckWrap=*/true)) {
LLVM_DEBUG(
dbgs() << "LV: Invalidate candidate interleaved group due to "
"last group member potentially pointer-wrapping.\n");
releaseGroup(Group);
}
} else {
// Case 3: A non-reversed interleaved load group with gaps: We need
// to execute at least one scalar epilogue iteration. This will ensure
// we don't speculatively access memory out-of-bounds. We only need
// to look for a member at index factor - 1, since every group must have
// a member at index zero.
if (Group->isReverse()) {
LLVM_DEBUG(
dbgs() << "LV: Invalidate candidate interleaved group due to "
"a reverse access with gaps.\n");
releaseGroup(Group);
continue;
}
LLVM_DEBUG(
dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
RequiresScalarEpilogue = true;
}
}
}
void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
// If no group had triggered the requirement to create an epilogue loop,
// there is nothing to do.
if (!requiresScalarEpilogue())
return;
// Avoid releasing a Group twice.
SmallPtrSet<InterleaveGroup<Instruction> *, 4> DelSet;
for (auto &I : InterleaveGroupMap) {
InterleaveGroup<Instruction> *Group = I.second;
if (Group->requiresScalarEpilogue())
DelSet.insert(Group);
}
for (auto *Ptr : DelSet) {
LLVM_DEBUG(
dbgs()
<< "LV: Invalidate candidate interleaved group due to gaps that "
"require a scalar epilogue (not allowed under optsize) and cannot "
"be masked (not enabled). \n");
releaseGroup(Ptr);
}
RequiresScalarEpilogue = false;
}
template <>
void InterleaveGroup<Instruction>::addMetadata(Instruction *NewInst) const {
SmallVector<Value *, 4> VL;
std::transform(Members.begin(), Members.end(), std::back_inserter(VL),
[](std::pair<int, Instruction *> p) { return p.second; });
propagateMetadata(NewInst, VL);
}
template <typename InstT>
void InterleaveGroup<InstT>::addMetadata(InstT *NewInst) const {
llvm_unreachable("addMetadata can only be used for Instruction");
}