//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This pass merges loads/stores to/from sequential memory addresses into vector
// loads/stores. Although there's nothing GPU-specific in here, this pass is
// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
//
// (For simplicity below we talk about loads only, but everything also applies
// to stores.)
//
// This pass is intended to be run late in the pipeline, after other
// vectorization opportunities have been exploited. So the assumption here is
// that immediately following our new vector load we'll need to extract out the
// individual elements of the load, so we can operate on them individually.
//
// On CPUs this transformation is usually not beneficial, because extracting the
// elements of a vector register is expensive on most architectures. It's
// usually better just to load each element individually into its own scalar
// register.
//
// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
// "vector load" loads directly into a series of scalar registers. In effect,
// extracting the elements of the vector is free. It's therefore always
// beneficial to vectorize a sequence of loads on these architectures.
//
// Vectorizing (perhaps a better name might be "coalescing") loads can have
// large performance impacts on GPU kernels, and opportunities for vectorizing
// are common in GPU code. This pass tries very hard to find such
// opportunities; its runtime is quadratic in the number of loads in a BB.
//
// Some CPU architectures, such as ARM, have instructions that load into
// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
// could use this pass (with some modifications), but currently it implements
// its own pass to do something similar to what we do here.
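//
// As a minimal illustration (a hypothetical IR sketch, not taken from any
// particular test case), four loads from consecutive 4-byte addresses such as
//
//   %l0 = load i32, i32* %p0
//   %l1 = load i32, i32* %p1   ; %p1 == %p0 + 4 bytes
//   %l2 = load i32, i32* %p2   ; %p2 == %p0 + 8 bytes
//   %l3 = load i32, i32* %p3   ; %p3 == %p0 + 12 bytes
//
// would ideally become a single wide load plus per-element extracts:
//
//   %vec = load <4 x i32>, <4 x i32>* %vp
//   %l0  = extractelement <4 x i32> %vec, i32 0
//   %l1  = extractelement <4 x i32> %vec, i32 1
//   ... and so on for %l2 and %l3.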
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
|
2017-10-17 23:27:42 +02:00
|
|
|
#include "llvm/ADT/APInt.h"
|
|
|
|
#include "llvm/ADT/ArrayRef.h"
|
2016-07-01 01:11:38 +02:00
|
|
|
#include "llvm/ADT/MapVector.h"
|
|
|
|
#include "llvm/ADT/PostOrderIterator.h"
|
2017-10-17 23:27:42 +02:00
|
|
|
#include "llvm/ADT/STLExtras.h"
|
|
|
|
#include "llvm/ADT/SmallPtrSet.h"
|
|
|
|
#include "llvm/ADT/SmallVector.h"
|
2016-07-01 01:11:38 +02:00
|
|
|
#include "llvm/ADT/Statistic.h"
|
2017-10-17 23:27:42 +02:00
|
|
|
#include "llvm/ADT/iterator_range.h"
|
2016-07-01 01:11:38 +02:00
|
|
|
#include "llvm/Analysis/AliasAnalysis.h"
|
2021-04-30 22:37:08 +02:00
|
|
|
#include "llvm/Analysis/AssumptionCache.h"
|
2017-10-17 23:27:42 +02:00
|
|
|
#include "llvm/Analysis/MemoryLocation.h"
|
2016-07-01 01:11:38 +02:00
|
|
|
#include "llvm/Analysis/ScalarEvolution.h"
|
|
|
|
#include "llvm/Analysis/TargetTransformInfo.h"
|
|
|
|
#include "llvm/Analysis/ValueTracking.h"
|
|
|
|
#include "llvm/Analysis/VectorUtils.h"
|
2017-10-17 23:27:42 +02:00
|
|
|
#include "llvm/IR/Attributes.h"
|
|
|
|
#include "llvm/IR/BasicBlock.h"
|
|
|
|
#include "llvm/IR/Constants.h"
|
2016-07-01 01:11:38 +02:00
|
|
|
#include "llvm/IR/DataLayout.h"
|
2017-10-17 23:27:42 +02:00
|
|
|
#include "llvm/IR/DerivedTypes.h"
|
2016-07-01 01:11:38 +02:00
|
|
|
#include "llvm/IR/Dominators.h"
|
2017-10-17 23:27:42 +02:00
|
|
|
#include "llvm/IR/Function.h"
|
2016-07-01 01:11:38 +02:00
|
|
|
#include "llvm/IR/IRBuilder.h"
|
2017-10-17 23:27:42 +02:00
|
|
|
#include "llvm/IR/InstrTypes.h"
|
|
|
|
#include "llvm/IR/Instruction.h"
|
2016-07-01 01:11:38 +02:00
|
|
|
#include "llvm/IR/Instructions.h"
|
Add an @llvm.sideeffect intrinsic
This patch implements Chandler's idea [0] for supporting languages that
require support for infinite loops with side effects, such as Rust, providing
part of a solution to bug 965 [1].
Specifically, it adds an `llvm.sideeffect()` intrinsic, which has no actual
effect, but which appears to optimization passes to have obscure side effects,
such that they don't optimize away loops containing it. It also teaches
several optimization passes to ignore this intrinsic, so that it doesn't
significantly impact optimization in most cases.
As discussed on llvm-dev [2], this patch is the first of two major parts.
The second part, to change LLVM's semantics to have defined behavior
on infinite loops by default, with a function attribute for opting into
potential-undefined-behavior, will be implemented and posted for review in
a separate patch.
[0] http://lists.llvm.org/pipermail/llvm-dev/2015-July/088103.html
[1] https://bugs.llvm.org/show_bug.cgi?id=965
[2] http://lists.llvm.org/pipermail/llvm-dev/2017-October/118632.html
Differential Revision: https://reviews.llvm.org/D38336
llvm-svn: 317729
2017-11-08 22:59:51 +01:00
|
|
|
#include "llvm/IR/IntrinsicInst.h"
|
2016-07-01 01:11:38 +02:00
|
|
|
#include "llvm/IR/Module.h"
|
|
|
|
#include "llvm/IR/Type.h"
|
2017-10-17 23:27:42 +02:00
|
|
|
#include "llvm/IR/User.h"
|
2016-07-01 01:11:38 +02:00
|
|
|
#include "llvm/IR/Value.h"
|
Sink all InitializePasses.h includes
This file lists every pass in LLVM, and is included by Pass.h, which is
very popular. Every time we add, remove, or rename a pass in LLVM, it
caused lots of recompilation.
I found this fact by looking at this table, which is sorted by the
number of times a file was changed over the last 100,000 git commits
multiplied by the number of object files that depend on it in the
current checkout:
recompiles touches affected_files header
342380 95 3604 llvm/include/llvm/ADT/STLExtras.h
314730 234 1345 llvm/include/llvm/InitializePasses.h
307036 118 2602 llvm/include/llvm/ADT/APInt.h
213049 59 3611 llvm/include/llvm/Support/MathExtras.h
170422 47 3626 llvm/include/llvm/Support/Compiler.h
162225 45 3605 llvm/include/llvm/ADT/Optional.h
158319 63 2513 llvm/include/llvm/ADT/Triple.h
140322 39 3598 llvm/include/llvm/ADT/StringRef.h
137647 59 2333 llvm/include/llvm/Support/Error.h
131619 73 1803 llvm/include/llvm/Support/FileSystem.h
Before this change, touching InitializePasses.h would cause 1345 files
to recompile. After this change, touching it only causes 550 compiles in
an incremental rebuild.
Reviewers: bkramer, asbirlea, bollu, jdoerfert
Differential Revision: https://reviews.llvm.org/D70211
2019-11-13 22:15:01 +01:00
|
|
|
#include "llvm/InitializePasses.h"
|
2017-10-17 23:27:42 +02:00
|
|
|
#include "llvm/Pass.h"
|
|
|
|
#include "llvm/Support/Casting.h"
|
2016-07-01 01:11:38 +02:00
|
|
|
#include "llvm/Support/Debug.h"
|
2017-04-26 18:39:58 +02:00
|
|
|
#include "llvm/Support/KnownBits.h"
|
2017-10-17 23:27:42 +02:00
|
|
|
#include "llvm/Support/MathExtras.h"
|
2016-07-01 01:11:38 +02:00
|
|
|
#include "llvm/Support/raw_ostream.h"
|
Sink all InitializePasses.h includes
This file lists every pass in LLVM, and is included by Pass.h, which is
very popular. Every time we add, remove, or rename a pass in LLVM, it
caused lots of recompilation.
I found this fact by looking at this table, which is sorted by the
number of times a file was changed over the last 100,000 git commits
multiplied by the number of object files that depend on it in the
current checkout:
recompiles touches affected_files header
342380 95 3604 llvm/include/llvm/ADT/STLExtras.h
314730 234 1345 llvm/include/llvm/InitializePasses.h
307036 118 2602 llvm/include/llvm/ADT/APInt.h
213049 59 3611 llvm/include/llvm/Support/MathExtras.h
170422 47 3626 llvm/include/llvm/Support/Compiler.h
162225 45 3605 llvm/include/llvm/ADT/Optional.h
158319 63 2513 llvm/include/llvm/ADT/Triple.h
140322 39 3598 llvm/include/llvm/ADT/StringRef.h
137647 59 2333 llvm/include/llvm/Support/Error.h
131619 73 1803 llvm/include/llvm/Support/FileSystem.h
Before this change, touching InitializePasses.h would cause 1345 files
to recompile. After this change, touching it only causes 550 compiles in
an incremental rebuild.
Reviewers: bkramer, asbirlea, bollu, jdoerfert
Differential Revision: https://reviews.llvm.org/D70211
2019-11-13 22:15:01 +01:00
|
|
|
#include "llvm/Transforms/Utils/Local.h"
|
2016-07-07 22:10:35 +02:00
|
|
|
#include "llvm/Transforms/Vectorize.h"
|
2017-10-17 23:27:42 +02:00
|
|
|
#include <algorithm>
|
|
|
|
#include <cassert>
|
|
|
|
#include <cstdlib>
|
|
|
|
#include <tuple>
|
|
|
|
#include <utility>

using namespace llvm;

#define DEBUG_TYPE "load-store-vectorizer"

STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");

// FIXME: Assuming stack alignment of 4 is always good enough
static const unsigned StackAdjustedAlignment = 4;

namespace {

/// ChainID is an arbitrary token that is allowed to be different only for the
/// accesses that are guaranteed to be considered non-consecutive by
/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
/// together, reducing the number of instructions the main search operates on
/// at a time; i.e., it exists purely to reduce compile time, since the main
/// search has O(n^2) time complexity. The underlying type of ChainID should not
/// be relied upon.
using ChainID = const Value *;
using InstrList = SmallVector<Instruction *, 8>;
using InstrListMap = MapVector<ChainID, InstrList>;
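
// (One hedged illustration, not a requirement imposed here: a caller might key
// chains on something like the underlying object of each access's pointer
// operand, on the assumption that accesses into unrelated objects will not be
// proven consecutive; the exact choice is deliberately left unspecified.)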

class Vectorizer {
  Function &F;
  AliasAnalysis &AA;
  AssumptionCache &AC;
  DominatorTree &DT;
  ScalarEvolution &SE;
  TargetTransformInfo &TTI;
  const DataLayout &DL;
  IRBuilder<> Builder;

public:
  Vectorizer(Function &F, AliasAnalysis &AA, AssumptionCache &AC,
             DominatorTree &DT, ScalarEvolution &SE, TargetTransformInfo &TTI)
      : F(F), AA(AA), AC(AC), DT(DT), SE(SE), TTI(TTI),
        DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}

  bool run();

private:
  unsigned getPointerAddressSpace(Value *I);

  static const unsigned MaxDepth = 3;

  bool isConsecutiveAccess(Value *A, Value *B);
  bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
                              unsigned Depth = 0) const;
  bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
                                   unsigned Depth) const;
  bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
                          unsigned Depth) const;

  /// After vectorization, reorder the instructions that I depends on
  /// (the instructions defining its operands), to ensure they dominate I.
  void reorder(Instruction *I);

  /// Returns the first and the last instructions in Chain.
  std::pair<BasicBlock::iterator, BasicBlock::iterator>
  getBoundaryInstrs(ArrayRef<Instruction *> Chain);

  /// Erases the original instructions after vectorizing.
  void eraseInstructions(ArrayRef<Instruction *> Chain);

  /// "Legalize" the vector type that would be produced by combining \p
  /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
  /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
  /// expected to have more than 4 elements.
  std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
  splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);

  /// Finds the largest prefix of Chain that's vectorizable, checking for
  /// intervening instructions which may affect the memory accessed by the
  /// instructions within Chain.
  ///
  /// The elements of \p Chain must be all loads or all stores and must be in
  /// address order.
  ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);

  /// Collects load and store instructions to vectorize.
  std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);

  /// Processes the collected instructions, the \p Map. The values of \p Map
  /// should be all loads or all stores.
  bool vectorizeChains(InstrListMap &Map);

  /// Finds the load/stores to consecutive memory addresses and vectorizes them.
  bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);

  /// Vectorizes the load instructions in Chain.
  bool
  vectorizeLoadChain(ArrayRef<Instruction *> Chain,
                     SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Vectorizes the store instructions in Chain.
  bool
  vectorizeStoreChain(ArrayRef<Instruction *> Chain,
                      SmallPtrSet<Instruction *, 16> *InstructionsProcessed);

  /// Check if this load/store access is misaligned.
  bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
                          Align Alignment);
};

class LoadStoreVectorizerLegacyPass : public FunctionPass {
public:
  static char ID;

  LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
    initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
  }

  bool runOnFunction(Function &F) override;

  StringRef getPassName() const override {
    return "GPU Load and Store Vectorizer";
  }

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<AAResultsWrapperPass>();
    AU.addRequired<AssumptionCacheTracker>();
    AU.addRequired<ScalarEvolutionWrapperPass>();
    AU.addRequired<DominatorTreeWrapperPass>();
    AU.addRequired<TargetTransformInfoWrapperPass>();
    AU.setPreservesCFG();
  }
};

} // end anonymous namespace

char LoadStoreVectorizerLegacyPass::ID = 0;

INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                      "Vectorize load and store instructions", false, false)
INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
                    "Vectorize load and store instructions", false, false)

Pass *llvm::createLoadStoreVectorizerPass() {
  return new LoadStoreVectorizerLegacyPass();
}

bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
  DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
  TargetTransformInfo &TTI =
      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
  AssumptionCache &AC =
      getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);

  Vectorizer V(F, AA, AC, DT, SE, TTI);
  return V.run();
}

PreservedAnalyses LoadStoreVectorizerPass::run(Function &F,
                                               FunctionAnalysisManager &AM) {
  // Don't vectorize when the attribute NoImplicitFloat is used.
  if (F.hasFnAttribute(Attribute::NoImplicitFloat))
    return PreservedAnalyses::all();

  AliasAnalysis &AA = AM.getResult<AAManager>(F);
  DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
  ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
  TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
  AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);

  Vectorizer V(F, AA, AC, DT, SE, TTI);
  bool Changed = V.run();
  PreservedAnalyses PA;
  PA.preserveSet<CFGAnalyses>();
  return Changed ? PA : PreservedAnalyses::all();
}
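
// Usage note (assuming the usual pass registration name): with the new pass
// manager this transform is typically exercised via
// `opt -passes=load-store-vectorizer`, while legacy pass-manager clients
// create it with createLoadStoreVectorizerPass() above.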

// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
// vectors of Instructions.
static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
  SmallVector<Value *, 8> VL(IL.begin(), IL.end());
  propagateMetadata(I, VL);
}

// Vectorizer Implementation
bool Vectorizer::run() {
  bool Changed = false;

  // Scan the blocks in the function in post order.
  for (BasicBlock *BB : post_order(&F)) {
    InstrListMap LoadRefs, StoreRefs;
    std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
    Changed |= vectorizeChains(LoadRefs);
    Changed |= vectorizeChains(StoreRefs);
  }

  return Changed;
}
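
// In outline: collectInstructions gathers the loads and stores of a block into
// per-ChainID lists, vectorizeChains walks each list, and the heavy lifting
// (splitting chains, alignment checks, emitting the wide access) happens
// further down in vectorizeInstructions and vectorize{Load,Store}Chain.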

unsigned Vectorizer::getPointerAddressSpace(Value *I) {
  if (LoadInst *L = dyn_cast<LoadInst>(I))
    return L->getPointerAddressSpace();
  if (StoreInst *S = dyn_cast<StoreInst>(I))
    return S->getPointerAddressSpace();
  return -1;
}

// FIXME: Merge with llvm::isConsecutiveAccess
bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
  Value *PtrA = getLoadStorePointerOperand(A);
  Value *PtrB = getLoadStorePointerOperand(B);
  unsigned ASA = getPointerAddressSpace(A);
  unsigned ASB = getPointerAddressSpace(B);

  // Check that the address spaces match and that the pointers are valid.
  if (!PtrA || !PtrB || (ASA != ASB))
    return false;

  // Make sure that A and B are different pointers of the same size type.
  Type *PtrATy = getLoadStoreType(A);
  Type *PtrBTy = getLoadStoreType(B);
  if (PtrA == PtrB ||
      PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
      DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
      DL.getTypeStoreSize(PtrATy->getScalarType()) !=
          DL.getTypeStoreSize(PtrBTy->getScalarType()))
    return false;

  unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
  APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));

  return areConsecutivePointers(PtrA, PtrB, Size);
}
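
// For example (hypothetical IR): for two i32 loads whose pointers are %p and
// %p plus 4 bytes, the store size of i32 is 4, so the accesses are consecutive
// exactly when areConsecutivePointers can prove the pointers differ by 4.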

bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
                                        APInt PtrDelta, unsigned Depth) const {
  unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
  APInt OffsetA(PtrBitWidth, 0);
  APInt OffsetB(PtrBitWidth, 0);
  PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
  PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);

  unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());

  if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
    return false;

  // In case we have to shrink the pointer,
  // stripAndAccumulateInBoundsConstantOffsets should properly handle a
  // possible overflow and the value should fit into the smallest data type
  // used in the cast/gep chain.
  assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
         OffsetB.getMinSignedBits() <= NewPtrBitWidth);

  OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
  OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
  PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);

  APInt OffsetDelta = OffsetB - OffsetA;

  // Check if they are based on the same pointer. That makes the offsets
  // sufficient.
  if (PtrA == PtrB)
    return OffsetDelta == PtrDelta;

  // Compute the base pointer delta needed to make the final delta equal to the
  // requested pointer delta.
  APInt BaseDelta = PtrDelta - OffsetDelta;

  // Compute the distance with SCEV between the base pointers.
  const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
  const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
  const SCEV *C = SE.getConstant(BaseDelta);
  const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
  if (X == PtrSCEVB)
    return true;

  // The above check will not catch the cases where one of the pointers is
  // factorized but the other one is not, such as (C + (S * (A + B))) vs
  // (AS + BS). Get the minus SCEV. That will allow re-combining the expressions
  // and getting the simplified difference.
  const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
  if (C == Dist)
    return true;

  // Sometimes even this doesn't work, because SCEV can't always see through
  // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
  // things the hard way.
  return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
}
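
// Quick worked example (hypothetical values): if PtrA strips to base+4 and
// PtrB strips to base+8, then OffsetDelta is 4; with a requested PtrDelta of 4
// and identical stripped bases, the early PtrA == PtrB check already answers
// yes. If the stripped bases differ syntactically, BaseDelta becomes 0 and the
// SCEV checks only need to show that the two bases are equal.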

static bool checkNoWrapFlags(Instruction *I, bool Signed) {
  BinaryOperator *BinOpI = cast<BinaryOperator>(I);
  return (Signed && BinOpI->hasNoSignedWrap()) ||
         (!Signed && BinOpI->hasNoUnsignedWrap());
}

static bool checkIfSafeAddSequence(const APInt &IdxDiff, Instruction *AddOpA,
                                   unsigned MatchingOpIdxA, Instruction *AddOpB,
                                   unsigned MatchingOpIdxB, bool Signed) {
  // If both AddOpA and AddOpB are adds with NSW/NUW and one of their operands
  // is the same, we can guarantee that the transformation is safe if we can
  // prove that AddOpA won't overflow when IdxDiff is added to the other
  // operand of AddOpA.
  // For example:
  //  %tmp7 = add nsw i32 %tmp2, %v0
  //  %tmp8 = sext i32 %tmp7 to i64
  //  ...
  //  %tmp11 = add nsw i32 %v0, 1
  //  %tmp12 = add nsw i32 %tmp2, %tmp11
  //  %tmp13 = sext i32 %tmp12 to i64
  //
  // Both %tmp7 and %tmp12 have the nsw flag and share the first operand
  // %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow because
  // %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 have the nsw flag.
  assert(AddOpA->getOpcode() == Instruction::Add &&
         AddOpB->getOpcode() == Instruction::Add &&
         checkNoWrapFlags(AddOpA, Signed) && checkNoWrapFlags(AddOpB, Signed));
  if (AddOpA->getOperand(MatchingOpIdxA) ==
      AddOpB->getOperand(MatchingOpIdxB)) {
    Value *OtherOperandA = AddOpA->getOperand(MatchingOpIdxA == 1 ? 0 : 1);
    Value *OtherOperandB = AddOpB->getOperand(MatchingOpIdxB == 1 ? 0 : 1);
    Instruction *OtherInstrA = dyn_cast<Instruction>(OtherOperandA);
    Instruction *OtherInstrB = dyn_cast<Instruction>(OtherOperandB);
    // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
    if (OtherInstrB && OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrB->getOperand(0) == OtherOperandA &&
          IdxDiff.getSExtValue() == CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
    if (OtherInstrA && OtherInstrA->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1))) {
      int64_t CstVal =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherOperandB &&
          IdxDiff.getSExtValue() == -CstVal)
        return true;
    }
    // Match `x +nsw/nuw (y +nsw/nuw c)` and
    // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
    if (OtherInstrA && OtherInstrB &&
        OtherInstrA->getOpcode() == Instruction::Add &&
        OtherInstrB->getOpcode() == Instruction::Add &&
        checkNoWrapFlags(OtherInstrA, Signed) &&
        checkNoWrapFlags(OtherInstrB, Signed) &&
        isa<ConstantInt>(OtherInstrA->getOperand(1)) &&
        isa<ConstantInt>(OtherInstrB->getOperand(1))) {
      int64_t CstValA =
          cast<ConstantInt>(OtherInstrA->getOperand(1))->getSExtValue();
      int64_t CstValB =
          cast<ConstantInt>(OtherInstrB->getOperand(1))->getSExtValue();
      if (OtherInstrA->getOperand(0) == OtherInstrB->getOperand(0) &&
          IdxDiff.getSExtValue() == (CstValB - CstValA))
        return true;
    }
  }
  return false;
}

bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
                                             APInt PtrDelta,
                                             unsigned Depth) const {
  auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
  auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
  if (!GEPA || !GEPB)
    return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);

  // Look through GEPs after checking they're the same except for the last
  // index.
  if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
      GEPA->getPointerOperand() != GEPB->getPointerOperand())
    return false;
  gep_type_iterator GTIA = gep_type_begin(GEPA);
  gep_type_iterator GTIB = gep_type_begin(GEPB);
  for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
    if (GTIA.getOperand() != GTIB.getOperand())
      return false;
    ++GTIA;
    ++GTIB;
  }

  Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
  Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
  if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
      OpA->getType() != OpB->getType())
    return false;

  if (PtrDelta.isNegative()) {
    if (PtrDelta.isMinSignedValue())
      return false;
    PtrDelta.negate();
    std::swap(OpA, OpB);
  }
  uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
  if (PtrDelta.urem(Stride) != 0)
    return false;
  unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
  APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
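
  // For instance (hypothetical numbers): if both GEPs index into an i32 array
  // and their last indices differ by one, Stride is 4, a PtrDelta of 4 divides
  // it evenly, and IdxDiff comes out as 1.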

  // Only look through a ZExt/SExt.
  if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
    return false;

  bool Signed = isa<SExtInst>(OpA);

  // At this point A could be a function parameter, i.e. not an instruction
  Value *ValA = OpA->getOperand(0);
  OpB = dyn_cast<Instruction>(OpB->getOperand(0));
  if (!OpB || ValA->getType() != OpB->getType())
    return false;

  // Now we need to prove that adding IdxDiff to ValA won't overflow.
  bool Safe = false;

  // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
  // ValA, we're okay.
  if (OpB->getOpcode() == Instruction::Add &&
      isa<ConstantInt>(OpB->getOperand(1)) &&
      IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
      checkNoWrapFlags(OpB, Signed))
    Safe = true;

  // Second attempt: check if we have eligible add NSW/NUW instruction
  // sequences.
  OpA = dyn_cast<Instruction>(ValA);
  if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
      OpB->getOpcode() == Instruction::Add && checkNoWrapFlags(OpA, Signed) &&
      checkNoWrapFlags(OpB, Signed)) {
    // In the checks below a matching operand in OpA and OpB is
    // an operand which is the same in those two instructions.
    // Below we account for possible orders of the operands of
    // these add instructions.
    for (unsigned MatchingOpIdxA : {0, 1})
      for (unsigned MatchingOpIdxB : {0, 1})
        if (!Safe)
          Safe = checkIfSafeAddSequence(IdxDiff, OpA, MatchingOpIdxA, OpB,
                                        MatchingOpIdxB, Signed);
  }

  unsigned BitWidth = ValA->getType()->getScalarSizeInBits();

  // Third attempt:
  // If all set bits of IdxDiff or any higher order bit other than the sign bit
  // are known to be zero in ValA, we can add Diff to it while guaranteeing no
  // overflow of any sort.
  if (!Safe) {
    KnownBits Known(BitWidth);
    computeKnownBits(ValA, Known, DL, 0, &AC, OpB, &DT);
    APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
    if (Signed)
      BitsAllowedToBeSet.clearBit(BitWidth - 1);
    if (BitsAllowedToBeSet.ult(IdxDiff))
      return false;
  }

  const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
  const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
  const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
  const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
  return X == OffsetSCEVB;
}

bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
                                    const APInt &PtrDelta,
                                    unsigned Depth) const {
  if (Depth++ == MaxDepth)
    return false;

  if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
    if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
      return SelectA->getCondition() == SelectB->getCondition() &&
             areConsecutivePointers(SelectA->getTrueValue(),
                                    SelectB->getTrueValue(), PtrDelta, Depth) &&
             areConsecutivePointers(SelectA->getFalseValue(),
                                    SelectB->getFalseValue(), PtrDelta, Depth);
    }
  }
  return false;
}

void Vectorizer::reorder(Instruction *I) {
  SmallPtrSet<Instruction *, 16> InstructionsToMove;
  SmallVector<Instruction *, 16> Worklist;

  Worklist.push_back(I);
  while (!Worklist.empty()) {
    Instruction *IW = Worklist.pop_back_val();
    int NumOperands = IW->getNumOperands();
    for (int i = 0; i < NumOperands; i++) {
      Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
      if (!IM || IM->getOpcode() == Instruction::PHI)
        continue;

      // If IM is in another BB, no need to move it, because this pass only
      // vectorizes instructions within one BB.
      if (IM->getParent() != I->getParent())
        continue;

      if (!IM->comesBefore(I)) {
        InstructionsToMove.insert(IM);
        Worklist.push_back(IM);
      }
    }
  }

  // All instructions to move should follow I. Start from I, not from begin().
  for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
       ++BBI) {
    if (!InstructionsToMove.count(&*BBI))
      continue;
    Instruction *IM = &*BBI;
    --BBI;
    IM->removeFromParent();
    IM->insertBefore(I);
  }
}
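
// The reordering above matters mainly for load chains: the vectorized load is
// inserted at the position of the first scalar load, so (for example) an
// address computation that originally sat between the first and last loads of
// the chain must be hoisted above that insertion point before it can feed the
// new wide load.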

std::pair<BasicBlock::iterator, BasicBlock::iterator>
Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
  Instruction *C0 = Chain[0];
  BasicBlock::iterator FirstInstr = C0->getIterator();
  BasicBlock::iterator LastInstr = C0->getIterator();

  BasicBlock *BB = C0->getParent();
  unsigned NumFound = 0;
  for (Instruction &I : *BB) {
    if (!is_contained(Chain, &I))
      continue;

    ++NumFound;
    if (NumFound == 1) {
      FirstInstr = I.getIterator();
    }
    if (NumFound == Chain.size()) {
      LastInstr = I.getIterator();
      break;
    }
  }

  // Range is [first, last).
  return std::make_pair(FirstInstr, ++LastInstr);
}

void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
  SmallVector<Instruction *, 16> Instrs;
  for (Instruction *I : Chain) {
    Value *PtrOperand = getLoadStorePointerOperand(I);
    assert(PtrOperand && "Instruction must have a pointer operand.");
    Instrs.push_back(I);
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
      Instrs.push_back(GEP);
  }

  // Erase instructions.
  for (Instruction *I : Instrs)
    if (I->use_empty())
      I->eraseFromParent();
}

std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
                               unsigned ElementSizeBits) {
  unsigned ElementSizeBytes = ElementSizeBits / 8;
  unsigned SizeBytes = ElementSizeBytes * Chain.size();
  unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
  if (NumLeft == Chain.size()) {
    if ((NumLeft & 1) == 0)
      NumLeft /= 2; // Split even in half
    else
      --NumLeft;    // Split off last element
  } else if (NumLeft == 0)
    NumLeft = 1;
  return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
}
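
// Two worked examples (hypothetical chains): for seven i8 accesses,
// SizeBytes = 7 and NumLeft = (7 - 3) / 1 = 4, so the chain splits 4 + 3. For
// three i32 accesses, SizeBytes = 12 and NumLeft = 3 == Chain.size(), which is
// odd, so the last element is split off and the chain splits 2 + 1.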

ArrayRef<Instruction *>
Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
  // These are in BB order, unlike Chain, which is in address order.
  SmallVector<Instruction *, 16> MemoryInstrs;
  SmallVector<Instruction *, 16> ChainInstrs;

  bool IsLoadChain = isa<LoadInst>(Chain[0]);
  LLVM_DEBUG({
    for (Instruction *I : Chain) {
      if (IsLoadChain)
        assert(isa<LoadInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
      else
        assert(isa<StoreInst>(I) &&
               "All elements of Chain must be loads, or all must be stores.");
    }
  });

  for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
    if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
      if (!is_contained(Chain, &I))
        MemoryInstrs.push_back(&I);
      else
        ChainInstrs.push_back(&I);
    } else if (isa<IntrinsicInst>(&I) &&
               cast<IntrinsicInst>(&I)->getIntrinsicID() ==
                   Intrinsic::sideeffect) {
      // Ignore llvm.sideeffect calls.
    } else if (isa<IntrinsicInst>(&I) &&
               cast<IntrinsicInst>(&I)->getIntrinsicID() ==
                   Intrinsic::pseudoprobe) {
      // Ignore llvm.pseudoprobe calls.
    } else if (isa<IntrinsicInst>(&I) &&
               cast<IntrinsicInst>(&I)->getIntrinsicID() == Intrinsic::assume) {
      // Ignore llvm.assume calls.
    } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
      LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
                        << '\n');
      break;
    } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
      LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
                        << '\n');
      break;
    }
  }

  // Loop until we find an instruction in ChainInstrs that we can't vectorize.
  unsigned ChainInstrIdx = 0;
  Instruction *BarrierMemoryInstr = nullptr;

  for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
    Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];

    // If a barrier memory instruction was found, chain instructions that follow
    // will not be added to the valid prefix.
    if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
      break;

    // Check (in BB order) if any instruction prevents ChainInstr from being
    // vectorized. Find and store the first such "conflicting" instruction.
    for (Instruction *MemInstr : MemoryInstrs) {
      // If a barrier memory instruction was found, do not check past it.
      if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
        break;

      auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
      auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
      if (MemLoad && ChainLoad)
        continue;

      // We can ignore the alias if we have a load/store pair and the load
      // is known to be invariant. The load cannot be clobbered by the store.
      auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
        return LI->hasMetadata(LLVMContext::MD_invariant_load);
      };

      // We can ignore the alias as long as the load comes before the store,
      // because that means we won't be moving the load past the store to
      // vectorize it (the vectorized load is inserted at the location of the
      // first load in the chain).
|
2018-04-24 17:28:47 +02:00
|
|
|
if (isa<StoreInst>(MemInstr) && ChainLoad &&
|
2020-02-18 23:33:54 +01:00
|
|
|
(IsInvariantLoad(ChainLoad) || ChainLoad->comesBefore(MemInstr)))
|
2016-07-01 01:11:38 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
// Same case, but in reverse.
|
2018-04-24 17:28:47 +02:00
|
|
|
if (MemLoad && isa<StoreInst>(ChainInstr) &&
|
2020-02-18 23:33:54 +01:00
|
|
|
(IsInvariantLoad(MemLoad) || MemLoad->comesBefore(ChainInstr)))
|
2016-07-01 01:11:38 +02:00
|
|
|
continue;
|
|
|
|
|
2016-07-28 01:06:00 +02:00
|
|
|
if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
|
|
|
|
MemoryLocation::get(ChainInstr))) {
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG({
|
2016-07-20 01:19:18 +02:00
|
|
|
dbgs() << "LSV: Found alias:\n"
|
|
|
|
" Aliasing instruction and pointer:\n"
|
2016-07-20 02:55:12 +02:00
|
|
|
<< " " << *MemInstr << '\n'
|
2018-03-09 22:05:58 +01:00
|
|
|
<< " " << *getLoadStorePointerOperand(MemInstr) << '\n'
|
2016-07-20 01:19:18 +02:00
|
|
|
<< " Aliased instruction and pointer:\n"
|
2016-07-20 02:55:12 +02:00
|
|
|
<< " " << *ChainInstr << '\n'
|
2018-03-09 22:05:58 +01:00
|
|
|
<< " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
|
2016-07-07 22:10:35 +02:00
|
|
|
});
|
2016-11-23 18:43:15 +01:00
|
|
|
// Save this aliasing memory instruction as a barrier, but allow other
|
|
|
|
// instructions that precede the barrier to be vectorized with this one.
|
|
|
|
BarrierMemoryInstr = MemInstr;
|
2016-07-20 02:55:12 +02:00
|
|
|
break;
|
2016-07-01 01:11:38 +02:00
|
|
|
}
|
|
|
|
}
|
2016-11-23 18:43:15 +01:00
|
|
|
// Continue the search only for store chains, since vectorizing stores that
|
|
|
|
// precede an aliasing load is valid. Conversely, vectorizing loads is valid
|
|
|
|
// up to an aliasing store, but should not pull loads from further down in
|
|
|
|
// the basic block.
|
|
|
|
if (IsLoadChain && BarrierMemoryInstr) {
|
|
|
|
// The BarrierMemoryInstr is a store that precedes ChainInstr.
|
2020-02-18 23:33:54 +01:00
|
|
|
assert(BarrierMemoryInstr->comesBefore(ChainInstr));
|
2016-07-20 02:55:12 +02:00
|
|
|
break;
|
2016-11-23 18:43:15 +01:00
|
|
|
}
|
2016-07-20 02:55:12 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Find the largest prefix of Chain whose elements are all in
|
|
|
|
// ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
|
|
|
|
// Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
|
|
|
|
// order.)
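// Illustration (hypothetical): if Chain is {A0, A1, A2} in address order but
// only A0 and A2 made it into ChainInstrs[0, ChainInstrIdx), the returned
// prefix is just {A0}, because A1 already breaks it.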
|
2016-08-13 02:04:12 +02:00
|
|
|
SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
|
|
|
|
ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
|
|
|
|
unsigned ChainIdx = 0;
|
|
|
|
for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
|
|
|
|
if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
|
2016-07-20 02:55:12 +02:00
|
|
|
break;
|
2016-07-01 01:11:38 +02:00
|
|
|
}
|
2016-07-20 02:55:12 +02:00
|
|
|
return Chain.slice(0, ChainIdx);
|
2016-07-01 01:11:38 +02:00
|
|
|
}
|
|
|
|
|
2020-07-31 11:09:54 +02:00
|
|
|
static ChainID getChainID(const Value *Ptr) {
|
|
|
|
const Value *ObjPtr = getUnderlyingObject(Ptr);
|
2018-07-25 23:33:00 +02:00
|
|
|
if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
|
|
|
|
// The selects themselves are distinct instructions even if they share the
|
|
|
|
// same condition and evaluate to consecutive pointers for true and false
|
|
|
|
// values of the condition. Therefore, using the selects themselves for
|
|
|
|
// grouping instructions would put consecutive accesses into different lists
|
|
|
|
// and they won't even be checked for being consecutive, and won't be
|
|
|
|
// vectorized.
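// Illustrative IR (hypothetical, not taken from a test):
//   %p0 = select i1 %c, i32* %a0, i32* %b0
//   %p1 = select i1 %c, i32* %a1, i32* %b1   ; one element past %a0 / %b0
// Keying both accesses on %c puts them into one list, so they can still be
// recognized as consecutive.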
|
|
|
|
return Sel->getCondition();
|
|
|
|
}
|
|
|
|
return ObjPtr;
|
|
|
|
}
|
|
|
|
|
2016-07-28 01:06:00 +02:00
|
|
|
std::pair<InstrListMap, InstrListMap>
|
2016-07-20 01:19:16 +02:00
|
|
|
Vectorizer::collectInstructions(BasicBlock *BB) {
|
2016-07-28 01:06:00 +02:00
|
|
|
InstrListMap LoadRefs;
|
|
|
|
InstrListMap StoreRefs;
|
2016-07-01 01:11:38 +02:00
|
|
|
|
|
|
|
for (Instruction &I : *BB) {
|
|
|
|
if (!I.mayReadOrWriteMemory())
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
|
|
|
|
if (!LI->isSimple())
|
|
|
|
continue;
|
|
|
|
|
2016-10-03 12:31:34 +02:00
|
|
|
// Skip if it's not legal to vectorize this load.
|
|
|
|
if (!TTI.isLegalToVectorizeLoad(LI))
|
|
|
|
continue;
|
|
|
|
|
2016-07-01 01:11:38 +02:00
|
|
|
Type *Ty = LI->getType();
|
|
|
|
if (!VectorType::isValidElementType(Ty->getScalarType()))
|
|
|
|
continue;
|
|
|
|
|
2016-07-01 02:36:54 +02:00
|
|
|
// Skip weird non-byte sizes. They probably aren't worth the effort of
|
|
|
|
// handling correctly.
|
|
|
|
unsigned TySize = DL.getTypeSizeInBits(Ty);
|
[LSV] Skip all non-byte sizes, not only less than eight bits
Summary:
The code comments indicate that no effort has been spent on
handling load/stores when the size isn't a multiple of the
byte size correctly. However, the code only avoided types
smaller than 8 bits. So for example a load of an i28 could
still be considered as a candidate for vectorization.
This patch adjusts the code to behave according to the code
comment.
The test case used to hit the following assert when
trying to use "cast" an i32 to i28 using CreateBitOrPointerCast:
opt: ../lib/IR/Instructions.cpp:2565: Assertion `castIsValid(op, S, Ty) && "Invalid cast!"' failed.
#0 PrintStackTraceSignalHandler(void*)
#1 SignalHandler(int)
#2 __restore_rt
#3 __GI_raise
#4 __GI_abort
#5 __GI___assert_fail
#6 llvm::CastInst::Create(llvm::Instruction::CastOps, llvm::Value*, llvm::Type*, llvm::Twine const&, llvm::Instruction*)
#7 llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>::CreateBitOrPointerCast(llvm::Value*, llvm::Type*, llvm::Twine const&)
#8 (anonymous namespace)::Vectorizer::vectorizeLoadChain(llvm::ArrayRef<llvm::Instruction*>, llvm::SmallPtrSet<llvm::Instruction*, 16u>*)
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D39295
llvm-svn: 316663
2017-10-26 15:42:55 +02:00
|
|
|
if ((TySize % 8) != 0)
|
2016-07-01 02:36:54 +02:00
|
|
|
continue;
|
|
|
|
|
[LSV] Avoid adding vectors of pointers as candidates
Summary:
We no longer add vectors of pointers as candidates for
load/store vectorization. It does not seem to work anyway,
but without this patch we can end up in asserts when trying
to create casts between an integer type and the pointer of
vectors type.
The test case I've added used to assert like this when trying to
cast between i64 and <2 x i16*>:
opt: ../lib/IR/Instructions.cpp:2565: Assertion `castIsValid(op, S, Ty) && "Invalid cast!"' failed.
#0 PrintStackTraceSignalHandler(void*)
#1 SignalHandler(int)
#2 __restore_rt
#3 __GI_raise
#4 __GI_abort
#5 __GI___assert_fail
#6 llvm::CastInst::Create(llvm::Instruction::CastOps, llvm::Value*, llvm::Type*, llvm::Twine const&, llvm::Instruction*)
#7 llvm::IRBuilder<llvm::ConstantFolder, llvm::IRBuilderDefaultInserter>::CreateBitOrPointerCast(llvm::Value*, llvm::Type*, llvm::Twine const&)
#8 Vectorizer::vectorizeStoreChain(llvm::ArrayRef<llvm::Instruction*>, llvm::SmallPtrSet<llvm::Instruction*, 16u>*)
Reviewers: arsenm
Reviewed By: arsenm
Subscribers: nhaehnle, llvm-commits
Differential Revision: https://reviews.llvm.org/D39296
llvm-svn: 316665
2017-10-26 15:59:15 +02:00
|
|
|
// Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
|
|
|
|
// functions are currently using an integer type for the vectorized
|
|
|
|
// load/store, and do not support casting between the integer type and a
|
|
|
|
// vector of pointers (e.g. i64 to <2 x i16*>).
|
|
|
|
if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
|
|
|
|
continue;
|
|
|
|
|
2016-07-01 04:07:22 +02:00
|
|
|
Value *Ptr = LI->getPointerOperand();
|
|
|
|
unsigned AS = Ptr->getType()->getPointerAddressSpace();
|
|
|
|
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
|
|
|
|
|
2018-03-07 18:09:18 +01:00
|
|
|
unsigned VF = VecRegSize / TySize;
|
|
|
|
VectorType *VecTy = dyn_cast<VectorType>(Ty);
|
|
|
|
|
2016-07-01 01:11:38 +02:00
|
|
|
// No point in looking at these if they're too big to vectorize.
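// (If one element already needs more than half of a vector register, we can
// never even pack two of them, so there is nothing to gain.)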
|
2018-03-07 18:09:18 +01:00
|
|
|
if (TySize > VecRegSize / 2 ||
|
|
|
|
(VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
|
2016-07-01 01:11:38 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
// Make sure all the users of a vector are constant-index extracts.
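// (vectorizeLoadChain rewrites such users by re-extracting the element, at a
// shifted constant index, from the wider combined vector; other kinds of
// users could not be rewritten that way.)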
|
2017-10-17 23:27:42 +02:00
|
|
|
if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
|
2016-07-28 01:06:00 +02:00
|
|
|
const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
|
|
|
|
return EEI && isa<ConstantInt>(EEI->getOperand(1));
|
2016-07-01 01:11:38 +02:00
|
|
|
}))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Save the load locations.
|
2020-07-31 11:09:54 +02:00
|
|
|
const ChainID ID = getChainID(Ptr);
|
2018-07-25 23:33:00 +02:00
|
|
|
LoadRefs[ID].push_back(LI);
|
2016-07-01 01:11:38 +02:00
|
|
|
} else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
|
|
|
|
if (!SI->isSimple())
|
|
|
|
continue;
|
|
|
|
|
2016-10-03 12:31:34 +02:00
|
|
|
// Skip if it's not legal to vectorize this store.
|
|
|
|
if (!TTI.isLegalToVectorizeStore(SI))
|
|
|
|
continue;
|
|
|
|
|
2016-07-01 01:11:38 +02:00
|
|
|
Type *Ty = SI->getValueOperand()->getType();
|
|
|
|
if (!VectorType::isValidElementType(Ty->getScalarType()))
|
|
|
|
continue;
|
|
|
|
|
2017-10-26 15:59:15 +02:00
|
|
|
// Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
|
|
|
|
// functions are currently using an integer type for the vectorized
|
|
|
|
// load/store, and do not support casting between the integer type and a
|
|
|
|
// vector of pointers (e.g. i64 to <2 x i16*>).
|
|
|
|
if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
|
|
|
|
continue;
|
|
|
|
|
2016-07-01 02:36:54 +02:00
|
|
|
// Skip weird non-byte sizes. They probably aren't worth the effort of
|
|
|
|
// handling correctly.
|
|
|
|
unsigned TySize = DL.getTypeSizeInBits(Ty);
|
2017-10-26 15:42:55 +02:00
|
|
|
if ((TySize % 8) != 0)
|
2016-07-01 02:36:54 +02:00
|
|
|
continue;
|
|
|
|
|
2016-07-01 04:07:22 +02:00
|
|
|
Value *Ptr = SI->getPointerOperand();
|
|
|
|
unsigned AS = Ptr->getType()->getPointerAddressSpace();
|
|
|
|
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
|
2017-10-26 15:42:55 +02:00
|
|
|
|
2018-03-07 18:09:18 +01:00
|
|
|
unsigned VF = VecRegSize / TySize;
|
|
|
|
VectorType *VecTy = dyn_cast<VectorType>(Ty);
|
|
|
|
|
2017-10-26 15:42:55 +02:00
|
|
|
// No point in looking at these if they're too big to vectorize.
|
2018-03-07 18:09:18 +01:00
|
|
|
if (TySize > VecRegSize / 2 ||
|
|
|
|
(VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
|
2016-07-01 01:11:38 +02:00
|
|
|
continue;
|
|
|
|
|
2017-10-17 23:27:42 +02:00
|
|
|
if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
|
2016-07-28 01:06:00 +02:00
|
|
|
const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
|
|
|
|
return EEI && isa<ConstantInt>(EEI->getOperand(1));
|
2016-07-01 01:11:38 +02:00
|
|
|
}))
|
|
|
|
continue;
|
|
|
|
|
|
|
|
// Save store location.
|
2020-07-31 11:09:54 +02:00
|
|
|
const ChainID ID = getChainID(Ptr);
|
2018-07-25 23:33:00 +02:00
|
|
|
StoreRefs[ID].push_back(SI);
|
2016-07-01 01:11:38 +02:00
|
|
|
}
|
|
|
|
}
|
2016-07-20 01:19:16 +02:00
|
|
|
|
|
|
|
return {LoadRefs, StoreRefs};
|
2016-07-01 01:11:38 +02:00
|
|
|
}
|
|
|
|
|
2016-07-28 01:06:00 +02:00
|
|
|
bool Vectorizer::vectorizeChains(InstrListMap &Map) {
|
2016-07-01 01:11:38 +02:00
|
|
|
bool Changed = false;
|
|
|
|
|
2018-07-25 23:33:00 +02:00
|
|
|
for (const std::pair<ChainID, InstrList> &Chain : Map) {
|
2016-07-01 01:11:38 +02:00
|
|
|
unsigned Size = Chain.second.size();
|
|
|
|
if (Size < 2)
|
|
|
|
continue;
|
|
|
|
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
|
2016-07-01 01:11:38 +02:00
|
|
|
|
|
|
|
// Process the stores in chunks of 64.
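// (A chunk size of 64 keeps each call to vectorizeInstructions within the
// fixed-size ConsecutiveChain[64] buffer it uses.)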
|
|
|
|
for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
|
|
|
|
unsigned Len = std::min<unsigned>(CE - CI, 64);
|
2016-07-28 01:06:00 +02:00
|
|
|
ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
|
2016-07-01 01:11:38 +02:00
|
|
|
Changed |= vectorizeInstructions(Chunk);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
2016-07-28 01:06:00 +02:00
|
|
|
bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
|
|
|
|
<< " instructions.\n");
|
2016-08-31 01:53:59 +02:00
|
|
|
SmallVector<int, 16> Heads, Tails;
|
2016-07-01 01:11:38 +02:00
|
|
|
int ConsecutiveChain[64];
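// ConsecutiveChain[i] == j records that Instrs[j] directly follows Instrs[i]
// in address order (-1 means no successor found); Heads/Tails keep the same
// pairs by index for the chain walk below.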
|
|
|
|
|
2017-10-26 15:59:15 +02:00
|
|
|
// Do a quadratic search on all of the given loads/stores and find all of the
|
|
|
|
// pairs of loads/stores that follow each other.
|
2016-07-01 01:11:38 +02:00
|
|
|
for (int i = 0, e = Instrs.size(); i < e; ++i) {
|
|
|
|
ConsecutiveChain[i] = -1;
|
|
|
|
for (int j = e - 1; j >= 0; --j) {
|
|
|
|
if (i == j)
|
|
|
|
continue;
|
|
|
|
|
|
|
|
if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
|
|
|
|
if (ConsecutiveChain[i] != -1) {
|
|
|
|
int CurDistance = std::abs(ConsecutiveChain[i] - i);
|
|
|
|
int NewDistance = std::abs(ConsecutiveChain[i] - j);
|
|
|
|
if (j < i || NewDistance > CurDistance)
|
|
|
|
continue; // Should not insert.
|
|
|
|
}
|
|
|
|
|
2016-08-31 01:53:59 +02:00
|
|
|
Tails.push_back(j);
|
|
|
|
Heads.push_back(i);
|
2016-07-01 01:11:38 +02:00
|
|
|
ConsecutiveChain[i] = j;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Changed = false;
|
2016-07-28 01:06:00 +02:00
|
|
|
SmallPtrSet<Instruction *, 16> InstructionsProcessed;
|
2016-07-01 01:11:38 +02:00
|
|
|
|
|
|
|
for (int Head : Heads) {
|
2016-07-13 23:20:01 +02:00
|
|
|
if (InstructionsProcessed.count(Instrs[Head]))
|
|
|
|
continue;
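// Skip this head if it is the tail of some other pair whose head has not been
// processed yet: starting the walk from that head instead yields a strictly
// longer chain through this instruction.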
|
2016-08-31 01:53:59 +02:00
|
|
|
bool LongerChainExists = false;
|
2016-07-13 23:20:01 +02:00
|
|
|
for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
|
|
|
|
if (Head == Tails[TIt] &&
|
|
|
|
!InstructionsProcessed.count(Instrs[Heads[TIt]])) {
|
2016-08-31 01:53:59 +02:00
|
|
|
LongerChainExists = true;
|
2016-07-13 23:20:01 +02:00
|
|
|
break;
|
|
|
|
}
|
2016-08-31 01:53:59 +02:00
|
|
|
if (LongerChainExists)
|
2016-07-01 01:11:38 +02:00
|
|
|
continue;
|
|
|
|
|
|
|
|
// We found an instr that starts a chain. Now follow the chain and try to
|
|
|
|
// vectorize it.
|
2016-07-28 01:06:00 +02:00
|
|
|
SmallVector<Instruction *, 16> Operands;
|
2016-07-01 01:11:38 +02:00
|
|
|
int I = Head;
|
2016-08-31 01:53:59 +02:00
|
|
|
while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
|
2016-07-13 23:20:01 +02:00
|
|
|
if (InstructionsProcessed.count(Instrs[I]))
|
2016-07-01 01:11:38 +02:00
|
|
|
break;
|
|
|
|
|
|
|
|
Operands.push_back(Instrs[I]);
|
|
|
|
I = ConsecutiveChain[I];
|
|
|
|
}
|
|
|
|
|
|
|
|
bool Vectorized = false;
|
|
|
|
if (isa<LoadInst>(*Operands.begin()))
|
2016-07-13 23:20:01 +02:00
|
|
|
Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
|
2016-07-01 01:11:38 +02:00
|
|
|
else
|
2016-07-13 23:20:01 +02:00
|
|
|
Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
|
2016-07-01 01:11:38 +02:00
|
|
|
|
|
|
|
Changed |= Vectorized;
|
|
|
|
}
|
|
|
|
|
|
|
|
return Changed;
|
|
|
|
}
|
|
|
|
|
2016-07-13 23:20:01 +02:00
|
|
|
bool Vectorizer::vectorizeStoreChain(
|
2016-07-28 01:06:00 +02:00
|
|
|
ArrayRef<Instruction *> Chain,
|
|
|
|
SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
|
2016-07-01 01:11:38 +02:00
|
|
|
StoreInst *S0 = cast<StoreInst>(Chain[0]);
|
2016-07-01 02:37:01 +02:00
|
|
|
|
2017-10-26 15:59:15 +02:00
|
|
|
// If the vector has an int element, default to int for the whole store.
|
2019-05-06 12:25:11 +02:00
|
|
|
Type *StoreTy = nullptr;
|
2016-07-28 01:06:00 +02:00
|
|
|
for (Instruction *I : Chain) {
|
|
|
|
StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
|
2016-07-01 02:37:01 +02:00
|
|
|
if (StoreTy->isIntOrIntVectorTy())
|
|
|
|
break;
|
2016-07-01 03:55:52 +02:00
|
|
|
|
|
|
|
if (StoreTy->isPtrOrPtrVectorTy()) {
|
|
|
|
StoreTy = Type::getIntNTy(F.getParent()->getContext(),
|
|
|
|
DL.getTypeSizeInBits(StoreTy));
|
|
|
|
break;
|
|
|
|
}
|
2016-07-01 02:37:01 +02:00
|
|
|
}
|
2019-05-06 12:25:11 +02:00
|
|
|
assert(StoreTy && "Failed to find store type");
|
2016-07-01 02:37:01 +02:00
|
|
|
|
2016-07-01 01:11:38 +02:00
|
|
|
unsigned Sz = DL.getTypeSizeInBits(StoreTy);
|
2016-07-01 04:07:22 +02:00
|
|
|
unsigned AS = S0->getPointerAddressSpace();
|
|
|
|
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
|
2016-07-01 01:11:38 +02:00
|
|
|
unsigned VF = VecRegSize / Sz;
|
|
|
|
unsigned ChainSize = Chain.size();
|
2020-05-17 22:14:42 +02:00
|
|
|
Align Alignment = S0->getAlign();
|
2016-07-01 01:11:38 +02:00
|
|
|
|
2016-07-13 23:20:01 +02:00
|
|
|
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
|
|
|
|
InstructionsProcessed->insert(Chain.begin(), Chain.end());
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-07-28 01:06:00 +02:00
|
|
|
ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
|
2016-07-20 01:19:20 +02:00
|
|
|
if (NewChain.empty()) {
|
2016-07-20 22:07:34 +02:00
|
|
|
// No vectorization possible.
|
2016-07-13 23:20:01 +02:00
|
|
|
InstructionsProcessed->insert(Chain.begin(), Chain.end());
|
2016-07-01 01:11:38 +02:00
|
|
|
return false;
|
2016-07-13 23:20:01 +02:00
|
|
|
}
|
2016-07-20 01:19:20 +02:00
|
|
|
if (NewChain.size() == 1) {
|
2016-07-13 23:20:01 +02:00
|
|
|
// Failed after the first instruction. Discard it and try the smaller chain.
|
2016-07-20 01:19:20 +02:00
|
|
|
InstructionsProcessed->insert(NewChain.front());
|
2016-07-13 23:20:01 +02:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Update Chain to the valid vectorizable subchain.
|
2016-07-20 01:19:20 +02:00
|
|
|
Chain = NewChain;
|
2016-07-13 23:20:01 +02:00
|
|
|
ChainSize = Chain.size();
|
2016-07-01 01:11:38 +02:00
|
|
|
|
2016-10-03 12:31:34 +02:00
|
|
|
// Check if it's legal to vectorize this chain. If not, split the chain and
|
|
|
|
// try again.
|
2016-09-10 00:20:14 +02:00
|
|
|
unsigned EltSzInBytes = Sz / 8;
|
|
|
|
unsigned SzInBytes = EltSzInBytes * ChainSize;
|
2016-07-01 01:11:38 +02:00
|
|
|
|
2020-08-27 20:19:46 +02:00
|
|
|
FixedVectorType *VecTy;
|
|
|
|
auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
|
2016-07-01 01:11:38 +02:00
|
|
|
if (VecStoreTy)
|
2020-05-29 19:06:26 +02:00
|
|
|
VecTy = FixedVectorType::get(StoreTy->getScalarType(),
|
|
|
|
Chain.size() * VecStoreTy->getNumElements());
|
2016-07-01 01:11:38 +02:00
|
|
|
else
|
2020-05-29 19:06:26 +02:00
|
|
|
VecTy = FixedVectorType::get(StoreTy, Chain.size());
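// E.g. four scalar i32 stores become one <4 x i32> store, while four
// <2 x i16> stores become one <8 x i16> store.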
|
2016-07-01 01:11:38 +02:00
|
|
|
|
2016-10-03 12:31:34 +02:00
|
|
|
// If it's more than the max vector size or the target has a better
|
|
|
|
// vector factor, break it into two pieces.
|
|
|
|
unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
|
|
|
|
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
|
|
|
|
" Creating two separate arrays.\n");
|
2016-10-03 12:31:34 +02:00
|
|
|
return vectorizeStoreChain(Chain.slice(0, TargetVF),
|
2019-11-15 21:49:35 +01:00
|
|
|
InstructionsProcessed) |
|
2016-10-03 12:31:34 +02:00
|
|
|
vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
|
2016-07-01 01:11:38 +02:00
|
|
|
}
|
|
|
|
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG({
|
2016-07-01 01:11:38 +02:00
|
|
|
dbgs() << "LSV: Stores to vectorize:\n";
|
2016-07-28 01:06:00 +02:00
|
|
|
for (Instruction *I : Chain)
|
|
|
|
dbgs() << " " << *I << "\n";
|
2016-07-07 22:10:35 +02:00
|
|
|
});
|
2016-07-01 01:11:38 +02:00
|
|
|
|
2016-07-13 23:20:01 +02:00
|
|
|
// We won't try again to vectorize the elements of the chain, regardless of
|
|
|
|
// whether we succeed below.
|
|
|
|
InstructionsProcessed->insert(Chain.begin(), Chain.end());
|
|
|
|
|
2016-07-01 01:11:38 +02:00
|
|
|
// If the store is going to be misaligned, don't vectorize it.
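// If the pointer is not in the alloca address space, split the chain and
// retry; otherwise we may be able to raise the alignment of the underlying
// stack object below.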
|
2021-02-05 04:22:04 +01:00
|
|
|
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
|
2018-09-18 04:05:44 +02:00
|
|
|
if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
|
|
|
|
auto Chains = splitOddVectorElts(Chain, Sz);
|
2019-11-15 21:49:35 +01:00
|
|
|
return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
|
2018-09-18 04:05:44 +02:00
|
|
|
vectorizeStoreChain(Chains.second, InstructionsProcessed);
|
|
|
|
}
|
2016-07-11 22:46:17 +02:00
|
|
|
|
2020-04-21 05:32:05 +02:00
|
|
|
Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
|
|
|
|
Align(StackAdjustedAlignment),
|
|
|
|
DL, S0, nullptr, &DT);
|
|
|
|
if (NewAlign >= Alignment)
|
|
|
|
Alignment = NewAlign;
|
2020-02-10 16:30:34 +01:00
|
|
|
else
|
|
|
|
return false;
|
2018-09-18 04:05:44 +02:00
|
|
|
}
|
|
|
|
|
2020-06-26 16:14:27 +02:00
|
|
|
if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
|
2018-09-18 04:05:44 +02:00
|
|
|
auto Chains = splitOddVectorElts(Chain, Sz);
|
2019-11-15 21:49:35 +01:00
|
|
|
return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
|
2018-09-18 04:05:44 +02:00
|
|
|
vectorizeStoreChain(Chains.second, InstructionsProcessed);
|
2016-07-01 01:11:38 +02:00
|
|
|
}
|
|
|
|
|
2016-07-20 01:19:20 +02:00
|
|
|
BasicBlock::iterator First, Last;
|
|
|
|
std::tie(First, Last) = getBoundaryInstrs(Chain);
|
2016-07-01 01:11:38 +02:00
|
|
|
Builder.SetInsertPoint(&*Last);
|
|
|
|
|
|
|
|
Value *Vec = UndefValue::get(VecTy);
|
|
|
|
|
|
|
|
if (VecStoreTy) {
|
|
|
|
unsigned VecWidth = VecStoreTy->getNumElements();
|
|
|
|
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
|
|
|
|
StoreInst *Store = cast<StoreInst>(Chain[I]);
|
|
|
|
for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
|
|
|
|
unsigned NewIdx = J + I * VecWidth;
|
|
|
|
Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
|
|
|
|
Builder.getInt32(J));
|
|
|
|
if (Extract->getType() != StoreTy->getScalarType())
|
|
|
|
Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
|
|
|
|
|
2016-07-07 22:10:35 +02:00
|
|
|
Value *Insert =
|
|
|
|
Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
|
2016-07-01 01:11:38 +02:00
|
|
|
Vec = Insert;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
|
|
|
|
StoreInst *Store = cast<StoreInst>(Chain[I]);
|
|
|
|
Value *Extract = Store->getValueOperand();
|
|
|
|
if (Extract->getType() != StoreTy->getScalarType())
|
2016-07-07 22:10:35 +02:00
|
|
|
Extract =
|
|
|
|
Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
|
2016-07-01 01:11:38 +02:00
|
|
|
|
2016-07-07 22:10:35 +02:00
|
|
|
Value *Insert =
|
|
|
|
Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
|
2016-07-01 01:11:38 +02:00
|
|
|
Vec = Insert;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-09-18 04:05:44 +02:00
|
|
|
StoreInst *SI = Builder.CreateAlignedStore(
|
|
|
|
Vec,
|
|
|
|
Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
|
|
|
|
Alignment);
|
2016-07-01 01:11:38 +02:00
|
|
|
propagateMetadata(SI, Chain);
|
|
|
|
|
|
|
|
eraseInstructions(Chain);
|
|
|
|
++NumVectorInstructions;
|
|
|
|
NumScalarsVectorized += Chain.size();
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2016-07-13 23:20:01 +02:00
|
|
|
bool Vectorizer::vectorizeLoadChain(
|
2016-07-28 01:06:00 +02:00
|
|
|
ArrayRef<Instruction *> Chain,
|
|
|
|
SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
|
2016-07-01 01:11:38 +02:00
|
|
|
LoadInst *L0 = cast<LoadInst>(Chain[0]);
|
2016-07-01 02:37:01 +02:00
|
|
|
|
|
|
|
// If the vector has an int element, default to int for the whole load.
|
2019-09-15 18:44:35 +02:00
|
|
|
Type *LoadTy = nullptr;
|
2016-07-01 02:37:01 +02:00
|
|
|
for (const auto &V : Chain) {
|
|
|
|
LoadTy = cast<LoadInst>(V)->getType();
|
|
|
|
if (LoadTy->isIntOrIntVectorTy())
|
|
|
|
break;
|
2016-07-01 03:55:52 +02:00
|
|
|
|
|
|
|
if (LoadTy->isPtrOrPtrVectorTy()) {
|
|
|
|
LoadTy = Type::getIntNTy(F.getParent()->getContext(),
|
|
|
|
DL.getTypeSizeInBits(LoadTy));
|
|
|
|
break;
|
|
|
|
}
|
2016-07-01 02:37:01 +02:00
|
|
|
}
|
2019-09-15 18:44:35 +02:00
|
|
|
assert(LoadTy && "Can't determine LoadInst type from chain");
|
2016-07-01 02:37:01 +02:00
|
|
|
|
2016-07-01 01:11:38 +02:00
|
|
|
unsigned Sz = DL.getTypeSizeInBits(LoadTy);
|
2016-07-01 04:07:22 +02:00
|
|
|
unsigned AS = L0->getPointerAddressSpace();
|
|
|
|
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
|
2016-07-01 01:11:38 +02:00
|
|
|
unsigned VF = VecRegSize / Sz;
|
|
|
|
unsigned ChainSize = Chain.size();
|
2020-05-17 22:14:42 +02:00
|
|
|
Align Alignment = L0->getAlign();
|
2016-07-01 01:11:38 +02:00
|
|
|
|
2016-07-13 23:20:01 +02:00
|
|
|
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
|
|
|
|
InstructionsProcessed->insert(Chain.begin(), Chain.end());
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2016-07-28 01:06:00 +02:00
|
|
|
ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
|
2016-07-20 01:19:20 +02:00
|
|
|
if (NewChain.empty()) {
|
2016-07-20 22:07:34 +02:00
|
|
|
// No vectorization possible.
|
2016-07-13 23:20:01 +02:00
|
|
|
InstructionsProcessed->insert(Chain.begin(), Chain.end());
|
|
|
|
return false;
|
|
|
|
}
|
2016-07-20 01:19:20 +02:00
|
|
|
if (NewChain.size() == 1) {
|
2016-07-13 23:20:01 +02:00
|
|
|
// Failed after the first instruction. Discard it and try the smaller chain.
|
2016-07-20 01:19:20 +02:00
|
|
|
InstructionsProcessed->insert(NewChain.front());
|
2016-07-01 01:11:38 +02:00
|
|
|
return false;
|
2016-07-13 23:20:01 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
// Update Chain to the valid vectorizable subchain.
|
2016-07-20 01:19:20 +02:00
|
|
|
Chain = NewChain;
|
2016-07-13 23:20:01 +02:00
|
|
|
ChainSize = Chain.size();
|
2016-07-01 01:11:38 +02:00
|
|
|
|
2016-10-03 12:31:34 +02:00
|
|
|
// Check if it's legal to vectorize this chain. If not, split the chain and
|
|
|
|
// try again.
|
2016-09-10 00:20:14 +02:00
|
|
|
unsigned EltSzInBytes = Sz / 8;
|
|
|
|
unsigned SzInBytes = EltSzInBytes * ChainSize;
|
2016-07-01 01:11:38 +02:00
|
|
|
VectorType *VecTy;
|
2020-08-27 20:19:46 +02:00
|
|
|
auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
|
2016-07-01 01:11:38 +02:00
|
|
|
if (VecLoadTy)
|
2020-05-29 19:06:26 +02:00
|
|
|
VecTy = FixedVectorType::get(LoadTy->getScalarType(),
|
|
|
|
Chain.size() * VecLoadTy->getNumElements());
|
2016-07-01 01:11:38 +02:00
|
|
|
else
|
2020-05-29 19:06:26 +02:00
|
|
|
VecTy = FixedVectorType::get(LoadTy, Chain.size());
|
2016-07-01 01:11:38 +02:00
|
|
|
|
2016-10-03 12:31:34 +02:00
|
|
|
// If it's more than the max vector size or the target has a better
|
|
|
|
// vector factor, break it into two pieces.
|
|
|
|
unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
|
|
|
|
if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
|
|
|
|
" Creating two separate arrays.\n");
|
2019-11-15 21:49:35 +01:00
|
|
|
return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
|
2016-10-03 12:31:34 +02:00
|
|
|
vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
|
2016-07-01 01:11:38 +02:00
|
|
|
}
|
|
|
|
|
2016-07-13 23:20:01 +02:00
|
|
|
// We won't try again to vectorize the elements of the chain, regardless of
|
|
|
|
// whether we succeed below.
|
|
|
|
InstructionsProcessed->insert(Chain.begin(), Chain.end());
|
|
|
|
|
2016-07-01 01:11:38 +02:00
|
|
|
// If the load is going to be misaligned, don't vectorize it.
|
2021-02-05 04:22:04 +01:00
|
|
|
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
|
2018-09-18 04:05:44 +02:00
|
|
|
if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
|
|
|
|
auto Chains = splitOddVectorElts(Chain, Sz);
|
2019-11-15 21:49:35 +01:00
|
|
|
return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
|
2018-09-18 04:05:44 +02:00
|
|
|
vectorizeLoadChain(Chains.second, InstructionsProcessed);
|
|
|
|
}
|
2016-07-11 22:46:17 +02:00
|
|
|
|
2020-04-21 05:32:05 +02:00
|
|
|
Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
|
|
|
|
Align(StackAdjustedAlignment),
|
|
|
|
DL, L0, nullptr, &DT);
|
|
|
|
if (NewAlign >= Alignment)
|
|
|
|
Alignment = NewAlign;
|
2020-02-10 16:30:34 +01:00
|
|
|
else
|
|
|
|
return false;
|
2016-07-01 01:11:38 +02:00
|
|
|
}
|
|
|
|
|
2020-06-26 16:14:27 +02:00
|
|
|
if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
|
2018-09-18 04:05:44 +02:00
|
|
|
auto Chains = splitOddVectorElts(Chain, Sz);
|
2019-11-15 21:49:35 +01:00
|
|
|
return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
|
2018-09-18 04:05:44 +02:00
|
|
|
vectorizeLoadChain(Chains.second, InstructionsProcessed);
|
|
|
|
}
|
|
|
|
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG({
|
2016-07-01 01:11:38 +02:00
|
|
|
dbgs() << "LSV: Loads to vectorize:\n";
|
2016-07-28 01:06:00 +02:00
|
|
|
for (Instruction *I : Chain)
|
|
|
|
I->dump();
|
2016-07-07 22:10:35 +02:00
|
|
|
});
|
2016-07-01 01:11:38 +02:00
|
|
|
|
2016-07-20 01:19:20 +02:00
|
|
|
// getVectorizablePrefix already computed getBoundaryInstrs. The value of
|
|
|
|
// Last may have changed since then, but the value of First won't have. If it
|
|
|
|
// matters, we could compute getBoundaryInstrs only once and reuse it here.
|
|
|
|
BasicBlock::iterator First, Last;
|
|
|
|
std::tie(First, Last) = getBoundaryInstrs(Chain);
|
Correct ordering of loads/stores.
Summary:
Aiming to correct the ordering of loads/stores. This patch changes the
insert point for loads to the position of the first load.
It updates the ordering method for loads to insert before, rather than after.
Before this patch the following sequence:
"load a[1], store a[1], store a[0], load a[2]"
Would incorrectly vectorize to "store a[0,1], load a[1,2]".
The correctness check was assuming the insertion point for loads is at
the position of the first load, when in practice it was at the last
load. An alternative fix would have been to invert the correctness check.
The current fix changes insert position but also requires reordering of
instructions before the vectorized load.
Updated testcases to reflect the changes.
Reviewers: tstellarAMD, llvm-commits, jlebar, arsenm
Subscribers: mzolotukhin
Differential Revision: http://reviews.llvm.org/D22071
llvm-svn: 275117
2016-07-12 00:34:29 +02:00
|
|
|
Builder.SetInsertPoint(&*First);
|
2016-07-01 01:11:38 +02:00
|
|
|
|
|
|
|
Value *Bitcast =
|
2016-07-07 22:10:35 +02:00
|
|
|
Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
|
2020-01-23 11:33:12 +01:00
|
|
|
LoadInst *LI =
|
|
|
|
Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
|
2016-07-01 01:11:38 +02:00
|
|
|
propagateMetadata(LI, Chain);
|
|
|
|
|
|
|
|
if (VecLoadTy) {
|
|
|
|
SmallVector<Instruction *, 16> InstrsToErase;
|
|
|
|
|
|
|
|
unsigned VecWidth = VecLoadTy->getNumElements();
|
|
|
|
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
|
|
|
|
for (auto Use : Chain[I]->users()) {
|
2016-07-28 01:06:00 +02:00
|
|
|
// All users of vector loads are ExtractElement instructions with
|
|
|
|
// constant indices, otherwise we would have bailed before now.
|
2016-07-01 01:11:38 +02:00
|
|
|
Instruction *UI = cast<Instruction>(Use);
|
|
|
|
unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
|
|
|
|
unsigned NewIdx = Idx + I * VecWidth;
|
2016-09-07 17:49:48 +02:00
|
|
|
Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
|
|
|
|
UI->getName());
|
2016-07-28 01:06:00 +02:00
|
|
|
if (V->getType() != UI->getType())
|
|
|
|
V = Builder.CreateBitCast(V, UI->getType());
|
2016-07-01 01:11:38 +02:00
|
|
|
|
|
|
|
// Replace the old instruction.
|
2016-07-28 01:06:00 +02:00
|
|
|
UI->replaceAllUsesWith(V);
|
2016-07-01 01:11:38 +02:00
|
|
|
InstrsToErase.push_back(UI);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-07-27 23:45:48 +02:00
|
|
|
// Bitcast might not be an Instruction, if the pointer being cast is a
|
|
|
|
// constant. In that case, no need to reorder anything.
|
|
|
|
if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
|
|
|
|
reorder(BitcastInst);
|
2016-07-01 01:11:38 +02:00
|
|
|
|
|
|
|
for (auto I : InstrsToErase)
|
|
|
|
I->eraseFromParent();
|
|
|
|
} else {
|
|
|
|
for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
|
2016-07-28 01:06:00 +02:00
|
|
|
Value *CV = Chain[I];
|
2016-09-07 17:49:48 +02:00
|
|
|
Value *V =
|
|
|
|
Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
|
2016-07-28 01:06:00 +02:00
|
|
|
if (V->getType() != CV->getType()) {
|
|
|
|
V = Builder.CreateBitOrPointerCast(V, CV->getType());
|
2016-07-01 03:55:52 +02:00
|
|
|
}
|
2016-07-01 01:11:38 +02:00
|
|
|
|
|
|
|
// Replace the old instruction.
|
2016-07-28 01:06:00 +02:00
|
|
|
CV->replaceAllUsesWith(V);
|
2016-07-01 01:11:38 +02:00
|
|
|
}
|
|
|
|
|
2016-07-27 23:45:48 +02:00
|
|
|
if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
|
|
|
|
reorder(BitcastInst);
|
2016-07-01 01:11:38 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
eraseInstructions(Chain);
|
|
|
|
|
|
|
|
++NumVectorInstructions;
|
|
|
|
NumScalarsVectorized += Chain.size();
|
|
|
|
return true;
|
|
|
|
}
|
2016-07-11 22:46:17 +02:00
|
|
|
|
|
|
|
bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
|
2021-02-05 04:22:04 +01:00
|
|
|
Align Alignment) {
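// An access whose alignment is a multiple of its size in bytes is never
// misaligned; only query the target for the remaining cases.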
|
|
|
|
if (Alignment.value() % SzInBytes == 0)
|
2016-08-04 18:38:44 +02:00
|
|
|
return false;
|
2016-09-10 00:20:14 +02:00
|
|
|
|
2016-07-11 22:46:17 +02:00
|
|
|
bool Fast = false;
|
2016-08-04 18:38:44 +02:00
|
|
|
bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
|
|
|
|
SzInBytes * 8, AddressSpace,
|
2016-07-11 22:46:17 +02:00
|
|
|
Alignment, &Fast);
|
2018-05-14 14:53:11 +02:00
|
|
|
LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
|
|
|
|
<< " and fast? " << Fast << "\n";);
|
2016-08-04 18:38:44 +02:00
|
|
|
return !Allows || !Fast;
|
2016-07-11 22:46:17 +02:00
|
|
|
}
|